From cecc8ba68b9d949ba463fdcf4364c0f3999ab07c Mon Sep 17 00:00:00 2001 From: Viktor Stanchev Date: Thu, 3 Sep 2015 20:25:24 -0700 Subject: [PATCH] factor out page parser --- pirate/pirate.py | 2 +- pirate/torrent.py | 128 +++++++++++++++++++++++++--------------------- 2 files changed, 72 insertions(+), 58 deletions(-) diff --git a/pirate/pirate.py b/pirate/pirate.py index c526f17..674ca46 100755 --- a/pirate/pirate.py +++ b/pirate/pirate.py @@ -91,7 +91,7 @@ def main(): help='list Sortable Types') parser.add_argument('-L', '--local', dest='database', help='an xml file containing the Pirate Bay database') - parser.add_argument('-p', dest='pages', default=1, + parser.add_argument('-p', dest='pages', default=1, type=int, help='the number of pages to fetch ' "(doesn't work with --local)") parser.add_argument('-0', dest='first', diff --git a/pirate/torrent.py b/pirate/torrent.py index e2e7eca..83b6202 100644 --- a/pirate/torrent.py +++ b/pirate/torrent.py @@ -62,48 +62,83 @@ def parse_magnets_seeds_leechers(found): return res -#TODO: redo this with html parser instead of regex #TODO: warn users when using a sort in a mode that doesn't accept sorts #TODO: warn users when using search terms in a mode that doesn't accept search terms #TODO: same with page parameter for top and top48h #TODO: warn the user if trying to use a minor category with top48h +def build_request_path(page, category, sort, mode, terms): + if mode == 'browse': + if(category == 0): + category = 100 + return '/browse/{}/{}/{}'.format(category, page, sort) + elif mode == 'recent': + # This is not a typo. There is no / between 48h and the category. + path = '/top/48h' + # only major categories can be used with this mode + if(category == 0): + return path + 'all' + else: + return path + str(category) + elif mode == 'top': + path = '/top/' + if(category == 0): + return path + 'all' + else: + return path + str(category) + elif mode == 'search': + query = urllib.parse.quote_plus(' '.join(terms)) + return '/search/{}/{}/{}/{}'.format(query, page, sort, category) + else: + raise Exception('Unknown mode.') + + +#TODO: redo this with html parser instead of regex +def parse_page(res): + found = re.findall(parser_regex, res) + + # check for a blocked mirror + no_results = re.search(r'No hits\. Try adding an asterisk in ' + r'you search phrase\.', res) + if found == [] and no_results is None: + # Contradiction - we found no results, + # but the page didn't say there were no results. + # The page is probably not actually the pirate bay, + # so let's try another mirror + raise IOError('Blocked mirror detected.') + + # get sizes as well and substitute the   character + # TODO: use actual html decode + sizes = [match.replace(' ', ' ').split() + for match in re.findall(r'(?<=Size )[0-9.]' + r'+\ \;[KMGT]*[i ]*B', res)] + + uploaded = [match.replace(' ', ' ') + for match in re.findall(r'(?<=Uploaded )' + r'.+(?=\, Size)',res)] + + identifiers = [match.replace(' ', ' ') + for match in re.findall('(?<=/torrent/)' + '[0-9]+(?=/)',res)] + + res_l = parse_magnets_seeds_leechers(found) + + return res_l, sizes, uploaded, identifiers + + def remote(pages, category, sort, mode, terms, mirror): res_l = [] - pages = int(pages) + sizes = [] + uploaded = [] + identifiers = [] + if pages < 1: raise ValueError('Please provide an integer greater than 0 ' 'for the number of pages to fetch.') # Catch the Ctrl-C exception and exit cleanly try: - sizes = [] - uploaded = [] - identifiers = [] for page in range(pages): - if mode == 'browse': - path = '/browse/' - if(category == 0): - category = 100 - path = '/browse/{}/{}/{}'.format(category, page, sort) - elif mode == 'recent': - # This is not a typo. There is no / between 48h and the category. - path = '/top/48h' - # only major categories can be used with this mode - if(category == 0): - path += 'all' - else: - path += str(category) - elif mode == 'top': - path = '/top/' - if(category == 0): - path += 'all' - else: - path += str(category) - elif mode == 'search': - query = urllib.parse.quote_plus(' '.join(terms)) - path = '/search/{}/{}/{}/{}'.format(query, page, sort, category) - else: - raise Exception('Unknown mode.') + path = build_request_path(page, category, sort, mode, terms) req = request.Request(mirror + path, headers=pirate.data.default_headers) @@ -112,39 +147,18 @@ def remote(pages, category, sort, mode, terms, mirror): if f.info().get('Content-Encoding') == 'gzip': f = gzip.GzipFile(fileobj=BytesIO(f.read())) res = f.read().decode('utf-8') - found = re.findall(parser_regex, res) - # check for a blocked mirror - no_results = re.search(r'No hits\. Try adding an asterisk in ' - r'you search phrase\.', res) - if found == [] and no_results is None: - # Contradiction - we found no results, - # but the page didn't say there were no results. - # The page is probably not actually the pirate bay, - # so let's try another mirror - raise IOError('Blocked mirror detected.') + page_res_l, page_sizes, page_uploaded, page_identifiers = parse_page(res) + res_l += page_res_l + sizes += page_sizes + uploaded += page_uploaded + identifiers += page_identifiers - # get sizes as well and substitute the   character - # TODO: use actual html decode - sizes.extend([match.replace(' ', ' ').split() - for match in re.findall(r'(?<=Size )[0-9.]' - r'+\ \;[KMGT]*[i ]*B', res)]) - - uploaded.extend([match.replace(' ', ' ') - for match in re.findall(r'(?<=Uploaded )' - r'.+(?=\, Size)',res)]) - - identifiers.extend([match.replace(' ', ' ') - for match in re.findall('(?<=/torrent/)' - '[0-9]+(?=/)',res)]) - - res_l += parse_magnets_seeds_leechers(found) - - except KeyboardInterrupt : + except KeyboardInterrupt: print('\nCancelled.') sys.exit(0) - # return the sizes in a spearate list + # return the sizes in a separate list return res_l, sizes, uploaded, identifiers