switch to BeautifulSoup

2025-01-10 10:04:21 +01:00 · 2016-07-03 19:26:58 +02:00 · 2016-07-03 19:26:58 +02:00 · 41b3d56721
commit 41b3d56721
parent 4837a3e453
2 changed files with 27 additions and 25 deletions
--- a/pirate/torrent.py
+++ b/pirate/torrent.py
@@ -6,7 +6,7 @@ import urllib.parse as parse
 import urllib.error
 import os.path

-from pyquery import PyQuery as pq
+from bs4 import BeautifulSoup

 import pirate.data

@@ -76,25 +76,37 @@ def build_request_path(page, category, sort, mode, terms):

 # this returns a list of dictionaries
 def parse_page(html):
-    d = pq(html)
+    soup = BeautifulSoup(html, 'html.parser')
+    table = soup.find('table', id='searchResult')

    results = []
-    # parse the rows one by one
-    for row in d('table#searchResult tr'):
-        drow = d(row)
-        if len(drow('th')) > 0:
-            continue
+    no_results = re.search(r'No hits\. Try adding an asterisk in '
+                           r'you search phrase\.', html)

+    # check for a blocked mirror
+    if not table and not no_results:
+        # Contradiction - we found no results,
+        # but the page didn't say there were no results.
+        # The page is probably not actually the pirate bay,
+        # so let's try another mirror
+        raise IOError('Blocked mirror detected.')
+
+    if no_results:
+        return results
+
+    # parse the rows one by one (skipping headings)
+    for row in table('tr')[1:]:
        # grab info about the row
-        magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href')
-        seeds = pq(drow(':eq(0)>td:nth-child(3)')).text()
-        leechers = pq(drow(':eq(0)>td:nth-child(4)')).text()
-        id_ = pq(drow('.detLink')).attr('href').split('/')[2]
+        id_ = row.find('a', class_='detLink')['href'].split('/')[2]
+        seeds, leechers = [i.text for i in row('td')[-2:]]
+        magnet = row.find(lambda tag:
+                          tag.name == 'a' and
+                          tag['href'].startswith('magnet'))['href']

        # parse descriptions separately
-        desc_text = pq(drow('font.detDesc')[0]).text()
-        size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split()
-        uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0]
+        description = row.find('font', class_='detDesc').text
+        size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', description)[0].split()
+        uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', description)[0]

        results.append({
            'magnet': magnet,
@@ -105,16 +117,6 @@ def parse_page(html):
            'id': id_
        })

-    # check for a blocked mirror
-    no_results = re.search(r'No hits\. Try adding an asterisk in '
-                           r'you search phrase\.', html)
-    if len(results) == 0 and no_results is None:
-        # Contradiction - we found no results,
-        # but the page didn't say there were no results.
-        # The page is probably not actually the pirate bay,
-        # so let's try another mirror
-        raise IOError('Blocked mirror detected.')
-
    return results


--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setup(name='pirate-get',
    entry_points={
        'console_scripts': ['pirate-get = pirate.pirate:main']
    },
-    install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9', 'veryprettytable>=0.8.1'],
+    install_requires=['colorama>=0.3.3', 'beautifulsoup4>=4.4.1', 'veryprettytable>=0.8.1'],
    keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
    classifiers=[
        'Topic :: Utilities',