mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-10 10:04:21 +01:00
rewrite html parser with pyquery
This commit is contained in:
parent
d0a9d0f51e
commit
43f8ffefea
@ -6,6 +6,8 @@ import urllib.parse as parse
|
|||||||
import urllib.error
|
import urllib.error
|
||||||
import os.path
|
import os.path
|
||||||
|
|
||||||
|
from pyquery import PyQuery as pq
|
||||||
|
|
||||||
import pirate.data
|
import pirate.data
|
||||||
from pirate.print import print
|
from pirate.print import print
|
||||||
|
|
||||||
@ -43,25 +45,6 @@ def parse_sort(sort):
|
|||||||
return '99'
|
return '99'
|
||||||
|
|
||||||
|
|
||||||
def parse_magnets_seeds_leechers(found):
|
|
||||||
res = []
|
|
||||||
state = 'seeds'
|
|
||||||
curr = ['', 0, 0] #magnet, seeds, leeches
|
|
||||||
for f in found:
|
|
||||||
if f[1] == '':
|
|
||||||
curr[0] = f[0]
|
|
||||||
else:
|
|
||||||
if state == 'seeds':
|
|
||||||
curr[1] = f[1]
|
|
||||||
state = 'leeches'
|
|
||||||
else:
|
|
||||||
curr[2] = f[1]
|
|
||||||
state = 'seeds'
|
|
||||||
res.append(curr)
|
|
||||||
curr = ['', 0, 0]
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
#TODO: warn users when using a sort in a mode that doesn't accept sorts
|
||||||
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
#TODO: warn users when using search terms in a mode that doesn't accept search terms
|
||||||
#TODO: same with page parameter for top and top48h
|
#TODO: same with page parameter for top and top48h
|
||||||
@ -92,37 +75,40 @@ def build_request_path(page, category, sort, mode, terms):
|
|||||||
raise Exception('Unknown mode.')
|
raise Exception('Unknown mode.')
|
||||||
|
|
||||||
|
|
||||||
#TODO: redo this with html parser instead of regex
|
def parse_page(html):
|
||||||
def parse_page(res):
|
d = pq(html)
|
||||||
found = re.findall(parser_regex, res)
|
|
||||||
|
# first get the magnet links and make sure there are results
|
||||||
|
magnets = list(map(lambda l: pq(l).attr('href'),
|
||||||
|
d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)')))
|
||||||
|
|
||||||
# check for a blocked mirror
|
# check for a blocked mirror
|
||||||
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
||||||
r'you search phrase\.', res)
|
r'you search phrase\.', html)
|
||||||
if found == [] and no_results is None:
|
if len(magnets) == 0 and no_results is None:
|
||||||
# Contradiction - we found no results,
|
# Contradiction - we found no results,
|
||||||
# but the page didn't say there were no results.
|
# but the page didn't say there were no results.
|
||||||
# The page is probably not actually the pirate bay,
|
# The page is probably not actually the pirate bay,
|
||||||
# so let's try another mirror
|
# so let's try another mirror
|
||||||
raise IOError('Blocked mirror detected.')
|
raise IOError('Blocked mirror detected.')
|
||||||
|
|
||||||
# get sizes as well and substitute the &nbsp; character
|
# next get more info
|
||||||
# TODO: use actual html decode
|
seeds = list(map(lambda l: pq(l).text(),
|
||||||
sizes = [match.replace(' ', ' ').split()
|
d('table#searchResult tr>td:nth-child(3)')))
|
||||||
for match in re.findall(r'(?<=Size )[0-9.]'
|
leechers = list(map(lambda l: pq(l).text(),
|
||||||
r'+\ \;[KMGT]*[i ]*B', res)]
|
d('table#searchResult tr>td:nth-child(4)')))
|
||||||
|
identifiers = list(map(lambda l: pq(l).attr('href').split('/')[2],
|
||||||
|
d('table#searchResult .detLink')))
|
||||||
|
|
||||||
uploaded = [match.replace(' ', ' ')
|
sizes = []
|
||||||
for match in re.findall(r'(?<=Uploaded )'
|
uploaded = []
|
||||||
r'.+(?=\, Size)',res)]
|
# parse descriptions separately
|
||||||
|
for node in d('font.detDesc'):
|
||||||
|
text = pq(node).text()
|
||||||
|
sizes.append(re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', text)[0].split())
|
||||||
|
uploaded.append(re.findall(r'(?<=Uploaded ).+(?=\, Size)', text)[0])
|
||||||
|
|
||||||
identifiers = [match.replace(' ', ' ')
|
return list(zip(magnets,seeds,leechers)), sizes, uploaded, identifiers
|
||||||
for match in re.findall('(?<=/torrent/)'
|
|
||||||
'[0-9]+(?=/)',res)]
|
|
||||||
|
|
||||||
res_l = parse_magnets_seeds_leechers(found)
|
|
||||||
|
|
||||||
return res_l, sizes, uploaded, identifiers
|
|
||||||
|
|
||||||
|
|
||||||
def remote(pages, category, sort, mode, terms, mirror):
|
def remote(pages, category, sort, mode, terms, mirror):
|
||||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setup(name='pirate-get',
|
|||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': ['pirate-get = pirate.pirate:main']
|
'console_scripts': ['pirate-get = pirate.pirate:main']
|
||||||
},
|
},
|
||||||
install_requires=['colorama>=0.3.3'],
|
install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9'],
|
||||||
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
|
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Topic :: Utilities',
|
'Topic :: Utilities',
|
||||||
|
1
tests/data/blocked.html
Normal file
1
tests/data/blocked.html
Normal file
@ -0,0 +1 @@
|
|||||||
|
blocked.
|
200
tests/data/no_hits.html
Normal file
200
tests/data/no_hits.html
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user