1
0
mirror of https://github.com/vikstrous/pirate-get synced 2025-01-24 12:14:20 +01:00

handle mirrors with fake results table

This commit is contained in:
Michele Guerini Rocco 2019-11-24 11:01:58 +01:00
parent 757231a188
commit 6642130fc5
Signed by: rnhmjoj
GPG Key ID: BFBAF4C975F76450

View File

@ -80,14 +80,12 @@ def build_request_path(page, category, sort, mode, terms):
# this returns a list of dictionaries # this returns a list of dictionaries
def parse_page(html): def parse_page(html):
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', id='searchResult') tables = soup.find_all('table', id='searchResult')
results = []
no_results = re.search(r'No hits\. Try adding an asterisk in ' no_results = re.search(r'No hits\. Try adding an asterisk in '
r'you search phrase\.', html) r'you search phrase\.', html)
# check for a blocked mirror # check for a blocked mirror
if not table and not no_results: if not tables and not no_results:
# Contradiction - we found no results, # Contradiction - we found no results,
# but the page didn't say there were no results. # but the page didn't say there were no results.
# The page is probably not actually the pirate bay, # The page is probably not actually the pirate bay,
@ -95,19 +93,36 @@ def parse_page(html):
raise IOError('Blocked mirror detected.') raise IOError('Blocked mirror detected.')
if no_results: if no_results:
return []
# handle ads disguised as fake result tables
for table in tables:
results = parse_table(table)
if results:
break
else:
raise IOError('Mirror does not contain magnets.')
return results return results
def parse_table(table):
results = []
# parse the rows one by one (skipping headings) # parse the rows one by one (skipping headings)
for row in table('tr')[1:]: for row in table('tr')[1:]:
# grab info about the row # grab info about the row
row_link = row.find('a', class_='detLink') row_link = row.find('a', class_='detLink')
if row_link is None: if row_link is None:
continue continue
id_ = row_link['href'].split('/')[2] id_ = row_link['href'].split('/')[2]
seeds, leechers = [i.text for i in row('td')[-2:]] seeds, leechers = [i.text for i in row('td')[-2:]]
magnet = row.find(lambda tag: magnet_tag = row.find(lambda tag: tag.name == 'a' and
tag.name == 'a' and tag['href'].startswith('magnet'))
tag['href'].startswith('magnet'))['href'] if magnet_tag is None:
continue
magnet = magnet_tag['href']
# parse descriptions separately # parse descriptions separately
description = row.find('font', class_='detDesc').text description = row.find('font', class_='detDesc').text
@ -203,6 +218,7 @@ def save_magnets(printer, chosen_links, results, folder):
with open(file, 'w') as f: with open(file, 'w') as f:
f.write(magnet + '\n') f.write(magnet + '\n')
def copy_magnets(printer, chosen_links, results): def copy_magnets(printer, chosen_links, results):
clipboard_text = '' clipboard_text = ''
for link in chosen_links: for link in chosen_links: