1
0
mirror of https://github.com/vikstrous/pirate-get synced 2025-01-25 12:24:20 +01:00

Merge pull request #86 from vikstrous/beautifulsoup

switch to BeautifulSoup
This commit is contained in:
Michele Guerini Rocco 2016-07-03 19:31:23 +02:00 committed by GitHub
commit 7deac793b5
2 changed files with 27 additions and 25 deletions

View File

@@ -6,7 +6,7 @@ import urllib.parse as parse
import urllib.error import urllib.error
import os.path import os.path
from pyquery import PyQuery as pq from bs4 import BeautifulSoup
import pirate.data import pirate.data
@@ -76,25 +76,37 @@ def build_request_path(page, category, sort, mode, terms):
# this returns a list of dictionaries # this returns a list of dictionaries
def parse_page(html): def parse_page(html):
d = pq(html) soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', id='searchResult')
results = [] results = []
# parse the rows one by one no_results = re.search(r'No hits\. Try adding an asterisk in '
for row in d('table#searchResult tr'): r'you search phrase\.', html)
drow = d(row)
if len(drow('th')) > 0:
continue
# check for a blocked mirror
if not table and not no_results:
# Contradiction - we found no results,
# but the page didn't say there were no results.
# The page is probably not actually the pirate bay,
# so let's try another mirror
raise IOError('Blocked mirror detected.')
if no_results:
return results
# parse the rows one by one (skipping headings)
for row in table('tr')[1:]:
# grab info about the row # grab info about the row
magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href') id_ = row.find('a', class_='detLink')['href'].split('/')[2]
seeds = pq(drow(':eq(0)>td:nth-child(3)')).text() seeds, leechers = [i.text for i in row('td')[-2:]]
leechers = pq(drow(':eq(0)>td:nth-child(4)')).text() magnet = row.find(lambda tag:
id_ = pq(drow('.detLink')).attr('href').split('/')[2] tag.name == 'a' and
tag['href'].startswith('magnet'))['href']
# parse descriptions separately # parse descriptions separately
desc_text = pq(drow('font.detDesc')[0]).text() description = row.find('font', class_='detDesc').text
size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split() size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', description)[0].split()
uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0] uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', description)[0]
results.append({ results.append({
'magnet': magnet, 'magnet': magnet,
@@ -105,16 +117,6 @@ def parse_page(html):
'id': id_ 'id': id_
}) })
# check for a blocked mirror
no_results = re.search(r'No hits\. Try adding an asterisk in '
r'you search phrase\.', html)
if len(results) == 0 and no_results is None:
# Contradiction - we found no results,
# but the page didn't say there were no results.
# The page is probably not actually the pirate bay,
# so let's try another mirror
raise IOError('Blocked mirror detected.')
return results return results

View File

@@ -13,7 +13,7 @@ setup(name='pirate-get',
entry_points={ entry_points={
'console_scripts': ['pirate-get = pirate.pirate:main'] 'console_scripts': ['pirate-get = pirate.pirate:main']
}, },
install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9', 'veryprettytable>=0.8.1'], install_requires=['colorama>=0.3.3', 'beautifulsoup4>=4.4.1', 'veryprettytable>=0.8.1'],
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'], keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
classifiers=[ classifiers=[
'Topic :: Utilities', 'Topic :: Utilities',