mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-24 12:14:20 +01:00
factor out page parser
This commit is contained in:
parent
0ad94fca46
commit
cecc8ba68b
@@ -91,7 +91,7 @@ def main():
|
|||||||
help='list Sortable Types')
|
help='list Sortable Types')
|
||||||
parser.add_argument('-L', '--local', dest='database',
|
parser.add_argument('-L', '--local', dest='database',
|
||||||
help='an xml file containing the Pirate Bay database')
|
help='an xml file containing the Pirate Bay database')
|
||||||
parser.add_argument('-p', dest='pages', default=1,
|
parser.add_argument('-p', dest='pages', default=1, type=int,
|
||||||
help='the number of pages to fetch '
|
help='the number of pages to fetch '
|
||||||
"(doesn't work with --local)")
|
"(doesn't work with --local)")
|
||||||
parser.add_argument('-0', dest='first',
|
parser.add_argument('-0', dest='first',
|
||||||
|
@@ -62,48 +62,83 @@ def parse_magnets_seeds_leechers(found):
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
#TODO: redo this with html parser instead of regex
|
|
||||||
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
||||||
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
||||||
#TODO: same with page parameter for top and top48h
|
#TODO: same with page parameter for top and top48h
|
||||||
#TODO: warn the user if trying to use a minor category with top48h
|
#TODO: warn the user if trying to use a minor category with top48h
|
||||||
|
def build_request_path(page, category, sort, mode, terms):
    """Build the URL path for a Pirate Bay request.

    Args:
        page: zero-based page number (only used by 'browse' and 'search').
        category: numeric category id; 0 means "all categories".
        sort: numeric sort code substituted into the path.
        mode: one of 'browse', 'recent', 'top' or 'search'.
        terms: iterable of search-term strings (only used by 'search').

    Returns:
        The request path as a string, e.g. '/browse/100/0/7'.

    Raises:
        ValueError: if mode is not one of the four known modes.
    """
    if mode == 'browse':
        # category 0 ("all") maps to 100 in browse URLs
        if category == 0:
            category = 100
        return '/browse/{}/{}/{}'.format(category, page, sort)
    elif mode == 'recent':
        # This is not a typo. There is no / between 48h and the category.
        path = '/top/48h'
        # only major categories can be used with this mode
        if category == 0:
            return path + 'all'
        else:
            return path + str(category)
    elif mode == 'top':
        path = '/top/'
        if category == 0:
            return path + 'all'
        else:
            return path + str(category)
    elif mode == 'search':
        query = urllib.parse.quote_plus(' '.join(terms))
        return '/search/{}/{}/{}/{}'.format(query, page, sort, category)
    else:
        # ValueError is more precise than a bare Exception and remains
        # backward-compatible for callers catching Exception.
        raise ValueError('Unknown mode.')
|
||||||
|
|
||||||
|
|
||||||
|
#TODO: redo this with html parser instead of regex
def parse_page(res):
    """Extract torrent data from one page of Pirate Bay result HTML.

    Args:
        res: the decoded HTML of a result page as a string.

    Returns:
        A tuple (res_l, sizes, uploaded, identifiers):
          res_l       - parsed magnet/seed/leecher records
                        (as produced by parse_magnets_seeds_leechers)
          sizes       - [value, unit] pairs, e.g. ['1.2', 'GiB']
          uploaded    - upload-date strings
          identifiers - numeric torrent-id strings

    Raises:
        IOError: if the page contains neither results nor the site's
            "No hits" message — the mirror is probably serving a block
            page instead of the real site.
    """
    found = re.findall(parser_regex, res)

    # check for a blocked mirror
    # NOTE: "you search phrase" matches the site's own text verbatim.
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', res)
    if found == [] and no_results is None:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
        # so let's try another mirror
        raise IOError('Blocked mirror detected.')

    # get sizes as well and substitute the &nbsp; entity with a space
    # (restored '&nbsp;' literals that an HTML render had decoded into
    # raw non-breaking-space characters)
    # TODO: use actual html decode
    sizes = [match.replace('&nbsp;', ' ').split()
             for match in re.findall(r'(?<=Size )[0-9.]'
                                     r'+\&nbsp\;[KMGT]*[i ]*B', res)]

    uploaded = [match.replace('&nbsp;', ' ')
                for match in re.findall(r'(?<=Uploaded )'
                                        r'.+(?=\, Size)', res)]

    identifiers = [match.replace('&nbsp;', ' ')
                   for match in re.findall('(?<=/torrent/)'
                                           '[0-9]+(?=/)', res)]

    res_l = parse_magnets_seeds_leechers(found)

    return res_l, sizes, uploaded, identifiers
|
||||||
|
|
||||||
|
|
||||||
def remote(pages, category, sort, mode, terms, mirror):
|
def remote(pages, category, sort, mode, terms, mirror):
|
||||||
res_l = []
|
res_l = []
|
||||||
pages = int(pages)
|
sizes = []
|
||||||
|
uploaded = []
|
||||||
|
identifiers = []
|
||||||
|
|
||||||
if pages < 1:
|
if pages < 1:
|
||||||
raise ValueError('Please provide an integer greater than 0 '
|
raise ValueError('Please provide an integer greater than 0 '
|
||||||
'for the number of pages to fetch.')
|
'for the number of pages to fetch.')
|
||||||
|
|
||||||
# Catch the Ctrl-C exception and exit cleanly
|
# Catch the Ctrl-C exception and exit cleanly
|
||||||
try:
|
try:
|
||||||
sizes = []
|
|
||||||
uploaded = []
|
|
||||||
identifiers = []
|
|
||||||
for page in range(pages):
|
for page in range(pages):
|
||||||
if mode == 'browse':
|
path = build_request_path(page, category, sort, mode, terms)
|
||||||
path = '/browse/'
|
|
||||||
if(category == 0):
|
|
||||||
category = 100
|
|
||||||
path = '/browse/{}/{}/{}'.format(category, page, sort)
|
|
||||||
elif mode == 'recent':
|
|
||||||
# This is not a typo. There is no / between 48h and the category.
|
|
||||||
path = '/top/48h'
|
|
||||||
# only major categories can be used with this mode
|
|
||||||
if(category == 0):
|
|
||||||
path += 'all'
|
|
||||||
else:
|
|
||||||
path += str(category)
|
|
||||||
elif mode == 'top':
|
|
||||||
path = '/top/'
|
|
||||||
if(category == 0):
|
|
||||||
path += 'all'
|
|
||||||
else:
|
|
||||||
path += str(category)
|
|
||||||
elif mode == 'search':
|
|
||||||
query = urllib.parse.quote_plus(' '.join(terms))
|
|
||||||
path = '/search/{}/{}/{}/{}'.format(query, page, sort, category)
|
|
||||||
else:
|
|
||||||
raise Exception('Unknown mode.')
|
|
||||||
|
|
||||||
req = request.Request(mirror + path,
|
req = request.Request(mirror + path,
|
||||||
headers=pirate.data.default_headers)
|
headers=pirate.data.default_headers)
|
||||||
@@ -112,39 +147,18 @@ def remote(pages, category, sort, mode, terms, mirror):
|
|||||||
if f.info().get('Content-Encoding') == 'gzip':
|
if f.info().get('Content-Encoding') == 'gzip':
|
||||||
f = gzip.GzipFile(fileobj=BytesIO(f.read()))
|
f = gzip.GzipFile(fileobj=BytesIO(f.read()))
|
||||||
res = f.read().decode('utf-8')
|
res = f.read().decode('utf-8')
|
||||||
found = re.findall(parser_regex, res)
|
|
||||||
|
|
||||||
# check for a blocked mirror
|
page_res_l, page_sizes, page_uploaded, page_identifiers = parse_page(res)
|
||||||
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
res_l += page_res_l
|
||||||
r'you search phrase\.', res)
|
sizes += page_sizes
|
||||||
if found == [] and no_results is None:
|
uploaded += page_uploaded
|
||||||
# Contradiction - we found no results,
|
identifiers += page_identifiers
|
||||||
# but the page didn't say there were no results.
|
|
||||||
# The page is probably not actually the pirate bay,
|
|
||||||
# so let's try another mirror
|
|
||||||
raise IOError('Blocked mirror detected.')
|
|
||||||
|
|
||||||
# get sizes as well and substitute the character
|
except KeyboardInterrupt:
|
||||||
# TODO: use actual html decode
|
|
||||||
sizes.extend([match.replace(' ', ' ').split()
|
|
||||||
for match in re.findall(r'(?<=Size )[0-9.]'
|
|
||||||
r'+\ \;[KMGT]*[i ]*B', res)])
|
|
||||||
|
|
||||||
uploaded.extend([match.replace(' ', ' ')
|
|
||||||
for match in re.findall(r'(?<=Uploaded )'
|
|
||||||
r'.+(?=\, Size)',res)])
|
|
||||||
|
|
||||||
identifiers.extend([match.replace(' ', ' ')
|
|
||||||
for match in re.findall('(?<=/torrent/)'
|
|
||||||
'[0-9]+(?=/)',res)])
|
|
||||||
|
|
||||||
res_l += parse_magnets_seeds_leechers(found)
|
|
||||||
|
|
||||||
except KeyboardInterrupt :
|
|
||||||
print('\nCancelled.')
|
print('\nCancelled.')
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# return the sizes in a spearate list
|
# return the sizes in a separate list
|
||||||
return res_l, sizes, uploaded, identifiers
|
return res_l, sizes, uploaded, identifiers
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user