mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-09 09:59:51 +01:00
rewrite html parser with pyquery
This commit is contained in:
parent
d0a9d0f51e
commit
43f8ffefea
@ -6,6 +6,8 @@ import urllib.parse as parse
|
||||
import urllib.error
|
||||
import os.path
|
||||
|
||||
from pyquery import PyQuery as pq
|
||||
|
||||
import pirate.data
|
||||
from pirate.print import print
|
||||
|
||||
@ -43,25 +45,6 @@ def parse_sort(sort):
|
||||
return '99'
|
||||
|
||||
|
||||
def parse_magnets_seeds_leechers(found):
    """Group a flat sequence of matches into [magnet, seeds, leechers] rows.

    Each item of ``found`` is indexed at positions 0 and 1:

    * when ``item[1]`` is the empty string, ``item[0]`` is stored as the
      magnet link of the row currently being built;
    * otherwise ``item[1]`` is stored as the seed count the first time and
      as the leecher count the second time, after which the row is finished
      and a fresh one is started.

    Values are stored exactly as they appear in ``found``; nothing is
    converted to ``int``.  A row is only emitted once its leecher value has
    been seen, so a trailing magnet (or magnet + seeds) without a leecher
    value is dropped.

    NOTE(review): presumably ``found`` is the output of ``re.findall`` with
    two capture groups -- confirm against the caller.

    Returns a list of three-element lists ``[magnet, seeds, leechers]``.
    """
    rows = []
    current = ['', 0, 0]  # magnet, seeds, leechers
    expecting = 'seeds'
    for match in found:
        if match[1] == '':
            # No count captured: this match carries the magnet link.
            current[0] = match[0]
        elif expecting == 'seeds':
            current[1] = match[1]
            expecting = 'leeches'
        else:
            # The second count completes the row.
            current[2] = match[1]
            rows.append(current)
            current = ['', 0, 0]
            expecting = 'seeds'
    return rows
|
||||
|
||||
|
||||
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
||||
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
||||
#TODO: same with page parameter for top and top48h
|
||||
@ -92,37 +75,40 @@ def build_request_path(page, category, sort, mode, terms):
|
||||
raise Exception('Unknown mode.')
|
||||
|
||||
|
||||
def parse_page(html):
    """Extract torrent rows from a search-result page.

    The page is parsed with pyquery; every value is read from the
    ``table#searchResult`` element or from the ``font.detDesc`` description
    nodes.

    Returns a 4-tuple ``(rows, sizes, uploaded, identifiers)``:

    * ``rows`` -- list of ``(magnet, seeds, leechers)`` tuples, built with
      ``zip`` and therefore truncated to the shortest of the three lists;
    * ``sizes`` -- one ``str.split()`` result per description node, taken
      from the text following ``"Size "``;
    * ``uploaded`` -- one string per description node, the text between
      ``"Uploaded "`` and ``", Size"``;
    * ``identifiers`` -- the third ``/``-separated component of each
      ``.detLink`` href.

    Raises ``IOError`` when no magnet links are found and the page does not
    contain the "No hits" message.  Raises ``IndexError`` when a description
    node does not match the size or upload-date pattern.
    """
    d = pq(html)

    # First get the magnet links and make sure there are results.
    magnets = [
        pq(link).attr('href')
        for link in d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)')
    ]

    # Check for a blocked mirror.
    # NOTE(review): "you search phrase" presumably mirrors the site's own
    # wording -- do not "correct" it without checking a real no-hits page.
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', html)
    if not magnets and no_results is None:
        # Contradiction - we found no results,
        # but the page didn't say there were no results.
        # The page is probably not actually the pirate bay,
        # so let's try another mirror
        raise IOError('Blocked mirror detected.')

    # Next get more info.
    seeds = [pq(cell).text()
             for cell in d('table#searchResult tr>td:nth-child(3)')]
    leechers = [pq(cell).text()
                for cell in d('table#searchResult tr>td:nth-child(4)')]
    identifiers = [pq(link).attr('href').split('/')[2]
                   for link in d('table#searchResult .detLink')]

    # Parse descriptions separately: size and upload date live in the same
    # free-text node, so they are pulled out with regular expressions.
    size_pattern = r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B'
    uploaded_pattern = r'(?<=Uploaded ).+(?=\, Size)'
    sizes = []
    uploaded = []
    for node in d('font.detDesc'):
        text = pq(node).text()
        sizes.append(re.findall(size_pattern, text)[0].split())
        uploaded.append(re.findall(uploaded_pattern, text)[0])

    return list(zip(magnets, seeds, leechers)), sizes, uploaded, identifiers
|
||||
|
||||
|
||||
def remote(pages, category, sort, mode, terms, mirror):
|
||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setup(name='pirate-get',
|
||||
entry_points={
|
||||
'console_scripts': ['pirate-get = pirate.pirate:main']
|
||||
},
|
||||
install_requires=['colorama>=0.3.3'],
|
||||
install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9'],
|
||||
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
|
||||
classifiers=[
|
||||
'Topic :: Utilities',
|
||||
|
1
tests/data/blocked.html
Normal file
1
tests/data/blocked.html
Normal file
@ -0,0 +1 @@
|
||||
blocked.
|
200
tests/data/no_hits.html
Normal file
200
tests/data/no_hits.html
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user