From 41b3d56721d6ae3e16ae97aa8bc066a5b6c5e77e Mon Sep 17 00:00:00 2001 From: rnhmjoj Date: Sun, 3 Jul 2016 19:26:58 +0200 Subject: [PATCH] switch to BeautifulSoup --- pirate/torrent.py | 50 ++++++++++++++++++++++++----------------------- setup.py | 2 +- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/pirate/torrent.py b/pirate/torrent.py index e59d6d4..3cb8eba 100644 --- a/pirate/torrent.py +++ b/pirate/torrent.py @@ -6,7 +6,7 @@ import urllib.parse as parse import urllib.error import os.path -from pyquery import PyQuery as pq +from bs4 import BeautifulSoup import pirate.data @@ -76,25 +76,37 @@ def build_request_path(page, category, sort, mode, terms): # this returns a list of dictionaries def parse_page(html): - d = pq(html) + soup = BeautifulSoup(html, 'html.parser') + table = soup.find('table', id='searchResult') results = [] - # parse the rows one by one - for row in d('table#searchResult tr'): - drow = d(row) - if len(drow('th')) > 0: - continue + no_results = re.search(r'No hits\. Try adding an asterisk in ' + r'you search phrase\.', html) + # check for a blocked mirror + if not table and not no_results: + # Contradiction - we found no results, + # but the page didn't say there were no results. + # The page is probably not actually the pirate bay, + # so let's try another mirror + raise IOError('Blocked mirror detected.') + + if no_results: + return results + + # parse the rows one by one (skipping headings) + for row in table('tr')[1:]: # grab info about the row - magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href') - seeds = pq(drow(':eq(0)>td:nth-child(3)')).text() - leechers = pq(drow(':eq(0)>td:nth-child(4)')).text() - id_ = pq(drow('.detLink')).attr('href').split('/')[2] + id_ = row.find('a', class_='detLink')['href'].split('/')[2] + seeds, leechers = [i.text for i in row('td')[-2:]] + magnet = row.find(lambda tag: + tag.name == 'a' and + tag['href'].startswith('magnet'))['href'] # parse descriptions separately - desc_text = pq(drow('font.detDesc')[0]).text() - size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split() - uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0] + description = row.find('font', class_='detDesc').text + size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', description)[0].split() + uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', description)[0] results.append({ 'magnet': magnet, @@ -105,16 +117,6 @@ def parse_page(html): 'id': id_ }) - # check for a blocked mirror - no_results = re.search(r'No hits\. Try adding an asterisk in ' - r'you search phrase\.', html) - if len(results) == 0 and no_results is None: - # Contradiction - we found no results, - # but the page didn't say there were no results. - # The page is probably not actually the pirate bay, - # so let's try another mirror - raise IOError('Blocked mirror detected.') - return results diff --git a/setup.py b/setup.py index 3b1c9d0..170db9d 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup(name='pirate-get', entry_points={ 'console_scripts': ['pirate-get = pirate.pirate:main'] }, - install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9', 'veryprettytable>=0.8.1'], + install_requires=['colorama>=0.3.3', 'beautifulsoup4>=4.4.1', 'veryprettytable>=0.8.1'], keywords=['torrent', 'magnet', 'download', 'tpb', 'client'], classifiers=[ 'Topic :: Utilities',