refactor results parser to be row-oriented

2025-01-10 10:04:21 +01:00 · 2015-09-03 22:34:08 -07:00 · 2015-09-03 22:34:08 -07:00 · 101f7e0fb1
commit 101f7e0fb1
parent 1f04700158
1 changed files with 28 additions and 23 deletions
--- a/pirate/torrent.py
+++ b/pirate/torrent.py
@ -79,39 +79,44 @@ def build_request_path(page, category, sort, mode, terms):
 def parse_page(html):
    d = pq(html)
-    # first get the magnet links and make sure there are results
+    results = []
-    magnets = list(map(lambda l: pq(l).attr('href'), 
+    # parse the rows one by one
-        d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)')))
+    for row in d('table#searchResult tr'):
        drow = d(row)
        if len(drow('th')) > 0:
            continue
        # grab info about the row
        magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href')
        seeds = pq(drow(':eq(0)>td:nth-child(3)')).text()
        leechers = pq(drow(':eq(0)>td:nth-child(4)')).text()
        id_ = pq(drow('.detLink')).attr('href').split('/')[2]
        # parse descriptions separately
        desc_text = pq(drow('font.detDesc')[0]).text()
        size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split()
        uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0]
        results.append({
            'magnet': magnet,
            'seeds': seeds,
            'leechers': leechers,
            'size': size,
            'uploaded': uploaded,
            'id': id_
        })
    # check for a blocked mirror
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', html)
-    if len(magnets) == 0 and no_results is None:
+    if len(results) == 0 and no_results is None:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
        # so let's try another mirror
        raise IOError('Blocked mirror detected.')
-    # next get more info
+    return results
    seeds = list(map(lambda l: pq(l).text(), 
        d('table#searchResult tr>td:nth-child(3)')))
    leechers = list(map(lambda l: pq(l).text(), 
        d('table#searchResult tr>td:nth-child(4)')))
    ids = list(map(lambda l: pq(l).attr('href').split('/')[2],
        d('table#searchResult .detLink')))
    sizes = []
    uploaded = []
    # parse descriptions separately
    for node in d('font.detDesc'):
        text = pq(node).text()
        sizes.append(re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', text)[0].split())
        uploaded.append(re.findall(r'(?<=Uploaded ).+(?=\, Size)', text)[0])
    titles = ('magnet', 'seeds', 'leechers', 'size', 'uploaded', 'id')
    rows = list(zip(magnets, seeds, leechers, sizes, uploaded, ids))
    return [dict(zip(titles,row)) for row in rows]
 def remote(pages, category, sort, mode, terms, mirror):