Mirror of https://github.com/vikstrous/pirate-get (synced 2025-01-10 10:04:21 +01:00)
refactor results parser to be row-oriented
This commit is contained in:
Parent: 1f04700158
Commit: 101f7e0fb1
@ -79,39 +79,44 @@ def build_request_path(page, category, sort, mode, terms):
|
|||||||
def parse_page(html):
|
def parse_page(html):
|
||||||
d = pq(html)
|
d = pq(html)
|
||||||
|
|
||||||
# first get the magnet links and make sure there are results
|
results = []
|
||||||
magnets = list(map(lambda l: pq(l).attr('href'),
|
# parse the rows one by one
|
||||||
d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)')))
|
for row in d('table#searchResult tr'):
|
||||||
|
drow = d(row)
|
||||||
|
if len(drow('th')) > 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# grab info about the row
|
||||||
|
magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href')
|
||||||
|
seeds = pq(drow(':eq(0)>td:nth-child(3)')).text()
|
||||||
|
leechers = pq(drow(':eq(0)>td:nth-child(4)')).text()
|
||||||
|
id_ = pq(drow('.detLink')).attr('href').split('/')[2]
|
||||||
|
|
||||||
|
# parse descriptions separately
|
||||||
|
desc_text = pq(drow('font.detDesc')[0]).text()
|
||||||
|
size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split()
|
||||||
|
uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0]
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
'magnet': magnet,
|
||||||
|
'seeds': seeds,
|
||||||
|
'leechers': leechers,
|
||||||
|
'size': size,
|
||||||
|
'uploaded': uploaded,
|
||||||
|
'id': id_
|
||||||
|
})
|
||||||
|
|
||||||
# check for a blocked mirror
|
# check for a blocked mirror
|
||||||
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
||||||
r'you search phrase\.', html)
|
r'you search phrase\.', html)
|
||||||
if len(magnets) == 0 and no_results is None:
|
if len(results) == 0 and no_results is None:
|
||||||
# Contradiction - we found no results,
|
# Contradiction - we found no results,
|
||||||
# but the page didn't say there were no results.
|
# but the page didn't say there were no results.
|
||||||
# The page is probably not actually the pirate bay,
|
# The page is probably not actually the pirate bay,
|
||||||
# so let's try another mirror
|
# so let's try another mirror
|
||||||
raise IOError('Blocked mirror detected.')
|
raise IOError('Blocked mirror detected.')
|
||||||
|
|
||||||
# next get more info
|
return results
|
||||||
seeds = list(map(lambda l: pq(l).text(),
|
|
||||||
d('table#searchResult tr>td:nth-child(3)')))
|
|
||||||
leechers = list(map(lambda l: pq(l).text(),
|
|
||||||
d('table#searchResult tr>td:nth-child(4)')))
|
|
||||||
ids = list(map(lambda l: pq(l).attr('href').split('/')[2],
|
|
||||||
d('table#searchResult .detLink')))
|
|
||||||
|
|
||||||
sizes = []
|
|
||||||
uploaded = []
|
|
||||||
# parse descriptions separately
|
|
||||||
for node in d('font.detDesc'):
|
|
||||||
text = pq(node).text()
|
|
||||||
sizes.append(re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', text)[0].split())
|
|
||||||
uploaded.append(re.findall(r'(?<=Uploaded ).+(?=\, Size)', text)[0])
|
|
||||||
|
|
||||||
titles = ('magnet', 'seeds', 'leechers', 'size', 'uploaded', 'id')
|
|
||||||
rows = list(zip(magnets, seeds, leechers, sizes, uploaded, ids))
|
|
||||||
return [dict(zip(titles,row)) for row in rows]
|
|
||||||
|
|
||||||
|
|
||||||
def remote(pages, category, sort, mode, terms, mirror):
|
def remote(pages, category, sort, mode, terms, mirror):
|
||||||
|
Loading…
Reference in New Issue
Block a user