handle mirrors with fake results table

2025-01-09 09:59:51 +01:00 · 2019-11-24 11:01:58 +01:00 · 2019-11-24 11:01:58 +01:00 · 6642130fc5
commit 6642130fc5
parent 757231a188
1 changed files with 24 additions and 8 deletions
--- a/pirate/torrent.py
+++ b/pirate/torrent.py
@ -80,14 +80,12 @@ def build_request_path(page, category, sort, mode, terms):
 # this returns a list of dictionaries
 def parse_page(html):
    soup = BeautifulSoup(html, 'html.parser')
-    table = soup.find('table', id='searchResult')
-
-    results = []
+    tables = soup.find_all('table', id='searchResult')
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', html)

    # check for a blocked mirror
-    if not table and not no_results:
+    if not tables and not no_results:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
@ -95,7 +93,21 @@ def parse_page(html):
        raise IOError('Blocked mirror detected.')

    if no_results:
-        return results
+        return []
+
+    # handle ads disguised as fake result tables
+    for table in tables:
+        results = parse_table(table)
+        if results:
+            break
+    else:
+        raise IOError('Mirror does not contain magnets.')
+
+    return results
+
+
+def parse_table(table):
+    results = []

    # parse the rows one by one (skipping headings)
    for row in table('tr')[1:]:
@ -103,11 +115,14 @@ def parse_page(html):
        row_link = row.find('a', class_='detLink')
        if row_link is None:
            continue
+
        id_ = row_link['href'].split('/')[2]
        seeds, leechers = [i.text for i in row('td')[-2:]]
-        magnet = row.find(lambda tag:
-                          tag.name == 'a' and
-                          tag['href'].startswith('magnet'))['href']
+        magnet_tag = row.find(lambda tag: tag.name == 'a' and
+                              tag['href'].startswith('magnet'))
+        if magnet_tag is None:
+            continue
+        magnet = magnet_tag['href']

        # parse descriptions separately
        description = row.find('font', class_='detDesc').text
@ -203,6 +218,7 @@ def save_magnets(printer, chosen_links, results, folder):
        with open(file, 'w') as f:
            f.write(magnet + '\n')

+
 def copy_magnets(printer, chosen_links, results):
    clipboard_text = ''
    for link in chosen_links: