mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-25 12:24:20 +01:00
switch to BeautifulSoup
This commit is contained in:
parent
4837a3e453
commit
41b3d56721
@ -6,7 +6,7 @@ import urllib.parse as parse
|
|||||||
import urllib.error
|
import urllib.error
|
||||||
import os.path
|
import os.path
|
||||||
|
|
||||||
from pyquery import PyQuery as pq
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import pirate.data
|
import pirate.data
|
||||||
|
|
||||||
@ -76,25 +76,37 @@ def build_request_path(page, category, sort, mode, terms):
|
|||||||
|
|
||||||
# this returns a list of dictionaries
|
# this returns a list of dictionaries
|
||||||
def parse_page(html):
|
def parse_page(html):
|
||||||
d = pq(html)
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
table = soup.find('table', id='searchResult')
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
# parse the rows one by one
|
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
||||||
for row in d('table#searchResult tr'):
|
r'you search phrase\.', html)
|
||||||
drow = d(row)
|
|
||||||
if len(drow('th')) > 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
# check for a blocked mirror
|
||||||
|
if not table and not no_results:
|
||||||
|
# Contradiction - we found no results,
|
||||||
|
# but the page didn't say there were no results.
|
||||||
|
# The page is probably not actually the pirate bay,
|
||||||
|
# so let's try another mirror
|
||||||
|
raise IOError('Blocked mirror detected.')
|
||||||
|
|
||||||
|
if no_results:
|
||||||
|
return results
|
||||||
|
|
||||||
|
# parse the rows one by one (skipping headings)
|
||||||
|
for row in table('tr')[1:]:
|
||||||
# grab info about the row
|
# grab info about the row
|
||||||
magnet = pq(drow(':eq(0)>td:nth-child(2)>a:nth-child(2)')[0]).attr('href')
|
id_ = row.find('a', class_='detLink')['href'].split('/')[2]
|
||||||
seeds = pq(drow(':eq(0)>td:nth-child(3)')).text()
|
seeds, leechers = [i.text for i in row('td')[-2:]]
|
||||||
leechers = pq(drow(':eq(0)>td:nth-child(4)')).text()
|
magnet = row.find(lambda tag:
|
||||||
id_ = pq(drow('.detLink')).attr('href').split('/')[2]
|
tag.name == 'a' and
|
||||||
|
tag['href'].startswith('magnet'))['href']
|
||||||
|
|
||||||
# parse descriptions separately
|
# parse descriptions separately
|
||||||
desc_text = pq(drow('font.detDesc')[0]).text()
|
description = row.find('font', class_='detDesc').text
|
||||||
size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', desc_text)[0].split()
|
size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', description)[0].split()
|
||||||
uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', desc_text)[0]
|
uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', description)[0]
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
'magnet': magnet,
|
'magnet': magnet,
|
||||||
@ -105,16 +117,6 @@ def parse_page(html):
|
|||||||
'id': id_
|
'id': id_
|
||||||
})
|
})
|
||||||
|
|
||||||
# check for a blocked mirror
|
|
||||||
no_results = re.search(r'No hits\. Try adding an asterisk in '
|
|
||||||
r'you search phrase\.', html)
|
|
||||||
if len(results) == 0 and no_results is None:
|
|
||||||
# Contradiction - we found no results,
|
|
||||||
# but the page didn't say there were no results.
|
|
||||||
# The page is probably not actually the pirate bay,
|
|
||||||
# so let's try another mirror
|
|
||||||
raise IOError('Blocked mirror detected.')
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setup(name='pirate-get',
|
|||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': ['pirate-get = pirate.pirate:main']
|
'console_scripts': ['pirate-get = pirate.pirate:main']
|
||||||
},
|
},
|
||||||
install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9', 'veryprettytable>=0.8.1'],
|
install_requires=['colorama>=0.3.3', 'beautifulsoup4>=4.4.1', 'veryprettytable>=0.8.1'],
|
||||||
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
|
keywords=['torrent', 'magnet', 'download', 'tpb', 'client'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Topic :: Utilities',
|
'Topic :: Utilities',
|
||||||
|
Loading…
Reference in New Issue
Block a user