1
0
mirror of https://github.com/vikstrous/pirate-get synced 2025-01-10 10:04:21 +01:00

rewrite html parser with pyquery

This commit is contained in:
Viktor Stanchev 2015-09-03 21:44:02 -07:00
parent d0a9d0f51e
commit 43f8ffefea
5 changed files with 240 additions and 42 deletions

View File

@ -6,6 +6,8 @@ import urllib.parse as parse
import urllib.error import urllib.error
import os.path import os.path
from pyquery import PyQuery as pq
import pirate.data import pirate.data
from pirate.print import print from pirate.print import print
@ -43,25 +45,6 @@ def parse_sort(sort):
return '99' return '99'
def parse_magnets_seeds_leechers(found):
    """Group flat regex matches into per-torrent records.

    `found` is a sequence of 2-tuples as produced by the page-scraping
    regex: a tuple whose second element is '' carries a magnet link in
    its first element; otherwise the second element is a numeric string
    that is, alternately, a seed count then a leech count.

    Returns a list of [magnet, seeds, leeches] lists.  Seed/leech values
    are kept as the strings the regex captured (not converted to int),
    matching what callers expect.
    """
    res = []
    # Simple two-state machine: the first numeric match after a magnet
    # is the seed count, the second is the leech count.
    state = 'seeds'
    curr = ['', 0, 0]  # magnet, seeds, leeches
    for f in found:
        if f[1] == '':
            # A magnet link starts a new record.
            curr[0] = f[0]
        else:
            if state == 'seeds':
                curr[1] = f[1]
                state = 'leeches'
            else:
                # Leech count completes the record; emit and reset.
                curr[2] = f[1]
                state = 'seeds'
                res.append(curr)
                curr = ['', 0, 0]
    return res
#TODO: warn users when using a sort in a mode that doesn't accept sorts #TODO: warn users when using a sort in a mode that doesn't accept sorts
#TODO: warn users when using search terms in a mode that doesn't accept search terms #TODO: warn users when using search terms in a mode that doesn't accept search terms
#TODO: same with page parameter for top and top48h #TODO: same with page parameter for top and top48h
@ -92,37 +75,40 @@ def build_request_path(page, category, sort, mode, terms):
raise Exception('Unknown mode.') raise Exception('Unknown mode.')
def parse_page(html):
    """Parse a Pirate Bay search-results page with pyquery.

    Returns a 4-tuple:
      - list of (magnet, seeds, leechers) tuples (all strings),
      - list of [value, unit] size token pairs,
      - list of upload-date strings,
      - list of torrent id strings taken from the /torrent/<id>/ links.

    Raises IOError when the page looks like a blocked/fake mirror
    (no results found but no "No hits" message either).
    """
    d = pq(html)

    # first get the magnet links and make sure there are results
    magnets = list(map(lambda l: pq(l).attr('href'),
                   d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)')))

    # check for a blocked mirror
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', html)
    if len(magnets) == 0 and no_results is None:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
        # so let's try another mirror
        raise IOError('Blocked mirror detected.')

    # next get more info
    seeds = list(map(lambda l: pq(l).text(),
                 d('table#searchResult tr>td:nth-child(3)')))
    leechers = list(map(lambda l: pq(l).text(),
                    d('table#searchResult tr>td:nth-child(4)')))
    identifiers = list(map(lambda l: pq(l).attr('href').split('/')[2],
                       d('table#searchResult .detLink')))

    sizes = []
    uploaded = []
    # parse descriptions separately
    for node in d('font.detDesc'):
        text = pq(node).text()
        sizes.append(re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', text)[0].split())
        uploaded.append(re.findall(r'(?<=Uploaded ).+(?=\, Size)', text)[0])

    return list(zip(magnets, seeds, leechers)), sizes, uploaded, identifiers
def remote(pages, category, sort, mode, terms, mirror): def remote(pages, category, sort, mode, terms, mirror):

View File

@ -13,7 +13,7 @@ setup(name='pirate-get',
entry_points={ entry_points={
'console_scripts': ['pirate-get = pirate.pirate:main'] 'console_scripts': ['pirate-get = pirate.pirate:main']
}, },
install_requires=['colorama>=0.3.3'], install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9'],
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'], keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
classifiers=[ classifiers=[
'Topic :: Utilities', 'Topic :: Utilities',

1
tests/data/blocked.html Normal file
View File

@ -0,0 +1 @@
blocked.

200
tests/data/no_hits.html Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long