mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-24 12:14:20 +01:00
factor out page parser
This commit is contained in:
parent
0ad94fca46
commit
cecc8ba68b
@@ -91,7 +91,7 @@ def main():
|
|||||||
help='list Sortable Types')
|
help='list Sortable Types')
|
||||||
parser.add_argument('-L', '--local', dest='database',
|
parser.add_argument('-L', '--local', dest='database',
|
||||||
help='an xml file containing the Pirate Bay database')
|
help='an xml file containing the Pirate Bay database')
|
||||||
parser.add_argument('-p', dest='pages', default=1,
|
parser.add_argument('-p', dest='pages', default=1, type=int,
|
||||||
help='the number of pages to fetch '
|
help='the number of pages to fetch '
|
||||||
"(doesn't work with --local)")
|
"(doesn't work with --local)")
|
||||||
parser.add_argument('-0', dest='first',
|
parser.add_argument('-0', dest='first',
|
||||||
|
@@ -62,48 +62,83 @@ def parse_magnets_seeds_leechers(found):
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
#TODO: redo this with html parser instead of regex
|
|
||||||
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
||||||
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
||||||
#TODO: same with page parameter for top and top48h
|
#TODO: same with page parameter for top and top48h
|
||||||
#TODO: warn the user if trying to use a minor category with top48h
|
#TODO: warn the user if trying to use a minor category with top48h
|
||||||
|
def build_request_path(page, category, sort, mode, terms):
    """Build the URL path for a Pirate Bay request.

    Args:
        page: zero-based page number (only used by 'browse' and 'search').
        category: numeric category id; 0 means "all categories".
        sort: numeric sort code substituted into the path.
        mode: one of 'browse', 'recent', 'top' or 'search'.
        terms: iterable of search-term strings (only used by 'search').

    Returns:
        The request path as a string, e.g. '/browse/100/0/7'.

    Raises:
        ValueError: if mode is not one of the four known modes.
    """
    if mode == 'browse':
        # category 0 ("all") maps to 100 in browse URLs
        if category == 0:
            category = 100
        return '/browse/{}/{}/{}'.format(category, page, sort)
    elif mode == 'recent':
        # This is not a typo. There is no / between 48h and the category.
        path = '/top/48h'
        # only major categories can be used with this mode
        if category == 0:
            return path + 'all'
        else:
            return path + str(category)
    elif mode == 'top':
        path = '/top/'
        if category == 0:
            return path + 'all'
        else:
            return path + str(category)
    elif mode == 'search':
        query = urllib.parse.quote_plus(' '.join(terms))
        return '/search/{}/{}/{}/{}'.format(query, page, sort, category)
    else:
        # ValueError is more precise than a bare Exception and remains
        # backward-compatible for callers catching Exception.
        raise ValueError('Unknown mode.')
|
||||||
|
|
||||||
|
|
||||||
|
#TODO: redo this with html parser instead of regex
def parse_page(res):
    """Extract torrent data from one page of Pirate Bay result HTML.

    Args:
        res: the decoded HTML of a result page as a string.

    Returns:
        A tuple (res_l, sizes, uploaded, identifiers):
          res_l       - parsed magnet/seed/leecher records
                        (as produced by parse_magnets_seeds_leechers)
          sizes       - [value, unit] pairs, e.g. ['1.2', 'GiB']
          uploaded    - upload-date strings
          identifiers - numeric torrent-id strings

    Raises:
        IOError: if the page contains neither results nor the site's
            "No hits" message — the mirror is probably serving a block
            page instead of the real site.
    """
    found = re.findall(parser_regex, res)

    # check for a blocked mirror
    # NOTE: "you search phrase" matches the site's own text verbatim.
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', res)
    if found == [] and no_results is None:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
        # so let's try another mirror
        raise IOError('Blocked mirror detected.')

    # get sizes as well and substitute the &nbsp; entity with a space
    # (restored '&nbsp;' literals that an HTML render had decoded into
    # raw non-breaking-space characters)
    # TODO: use actual html decode
    sizes = [match.replace('&nbsp;', ' ').split()
             for match in re.findall(r'(?<=Size )[0-9.]'
                                     r'+\&nbsp\;[KMGT]*[i ]*B', res)]

    uploaded = [match.replace('&nbsp;', ' ')
                for match in re.findall(r'(?<=Uploaded )'
                                        r'.+(?=\, Size)', res)]

    identifiers = [match.replace('&nbsp;', ' ')
                   for match in re.findall('(?<=/torrent/)'
                                           '[0-9]+(?=/)', res)]

    res_l = parse_magnets_seeds_leechers(found)

    return res_l, sizes, uploaded, identifiers
|
||||||
|
|
||||||
|
|
||||||
def remote(pages, category, sort, mode, terms, mirror):
|
def remote(pages, category, sort, mode, terms, mirror):
|
||||||
res_l = []
|
res_l = []
|
||||||
pages = int(pages)
|
sizes = []
|
||||||
|
uploaded = []
|
||||||
|
identifiers = []
|
||||||
|
|
||||||
if pages < 1:
|
if pages < 1:
|
||||||
raise ValueError('Please provide an integer greater than 0 '
|
raise ValueError('Please provide an integer greater than 0 '
|
||||||
'for the number of pages to fetch.')
|
'for the number of pages to fetch.')
|
||||||
|
|
||||||
# Catch the Ctrl-C exception and exit cleanly
|
# Catch the Ctrl-C exception and exit cleanly
|
||||||
try:
|
try:
|
||||||
sizes = []
|
|
||||||
uploaded = []
|
|
||||||
identifiers = []
|
|
||||||
for page in range(pages):
|
for page in range(pages):
|
||||||
if mode == 'browse':
|
path = build_request_path(page, category, sort, mode, terms)
|
||||||
path = '/browse/'
|
|
||||||
if(category == 0):
|
|
||||||
category = 100
|
|
||||||
path = '/browse/{}/{}/{}'.format(category, page, sort)
|
|
||||||
elif mode == 'recent':
|
|
||||||
# This is not a typo. There is no / between 48h and the category.
|
|
||||||
path = '/top/48h'
|
|
||||||
# only major categories can be used with this mode
|
|
||||||
if(category == 0):
|
|
||||||
path += 'all'
|
|
||||||
else:
|
|
||||||
path += str(category)
|
|
||||||
elif mode == 'top':
|
|
||||||
path = '/top/'
|
|
||||||
if(category == 0):
|
|
||||||
path += 'all'
|
|
||||||
else:
|
|
||||||
path += str(category)
|
|
||||||
elif mode == 'search':
|
|
||||||
query = urllib.parse.quote_plus(' '.join(terms))
|
|
||||||
path = '/search/{}/{}/{}/{}'.format(query, page, sort, category)
|
|
||||||
else:
|
|
||||||
raise Exception('Unknown mode.')
|
|
||||||
|
|
||||||
req = request.Request(mirror + path,
|
req = request.Request(mirror + path,
|
||||||
headers=pirate.data.default_headers)
|
headers=pirate.data.default_headers)
|
||||||
@@ -112,39 +147,18 @@ def remote(pages, category, sort, mode, terms, mirror):
|
|||||||
if f.info().get('Content-Encoding') == 'gzip':
|
if f.info().get('Content-Encoding') == 'gzip':
|
||||||
f = gzip.GzipFile(fileobj=BytesIO(f.read()))
|
f = gzip.GzipFile(fileobj=BytesIO(f.read()))
|
||||||
res = f.read().decode('utf-8')
|
res = f.read().decode('utf-8')
|
||||||
found = re.findall(parser_regex, res)
|
|
||||||
|
|
||||||
# check for a blocked mirror
|
page_res_l, page_sizes, page_uploaded, page_identifiers = parse_page(res)
|
||||||
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
res_l += page_res_l
|
||||||
r'you search phrase\.', res)
|
sizes += page_sizes
|
||||||
if found == [] and no_results is None:
|
uploaded += page_uploaded
|
||||||
# Contradiction - we found no results,
|
identifiers += page_identifiers
|
||||||
# but the page didn't say there were no results.
|
|
||||||
# The page is probably not actually the pirate bay,
|
|
||||||
# so let's try another mirror
|
|
||||||
raise IOError('Blocked mirror detected.')
|
|
||||||
|
|
||||||
# get sizes as well and substitute the character
|
except KeyboardInterrupt:
|
||||||
# TODO: use actual html decode
|
|
||||||
sizes.extend([match.replace(' ', ' ').split()
|
|
||||||
for match in re.findall(r'(?<=Size )[0-9.]'
|
|
||||||
r'+\ \;[KMGT]*[i ]*B', res)])
|
|
||||||
|
|
||||||
uploaded.extend([match.replace(' ', ' ')
|
|
||||||
for match in re.findall(r'(?<=Uploaded )'
|
|
||||||
r'.+(?=\, Size)',res)])
|
|
||||||
|
|
||||||
identifiers.extend([match.replace(' ', ' ')
|
|
||||||
for match in re.findall('(?<=/torrent/)'
|
|
||||||
'[0-9]+(?=/)',res)])
|
|
||||||
|
|
||||||
res_l += parse_magnets_seeds_leechers(found)
|
|
||||||
|
|
||||||
except KeyboardInterrupt :
|
|
||||||
print('\nCancelled.')
|
print('\nCancelled.')
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# return the sizes in a spearate list
|
# return the sizes in a separate list
|
||||||
return res_l, sizes, uploaded, identifiers
|
return res_l, sizes, uploaded, identifiers
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user