From 43f8ffefea5617268f13c14b628ca989f716da83 Mon Sep 17 00:00:00 2001 From: Viktor Stanchev Date: Thu, 3 Sep 2015 21:44:02 -0700 Subject: [PATCH] rewrite html parser with pyquery --- pirate/torrent.py | 64 +++++-------- setup.py | 2 +- tests/data/blocked.html | 1 + tests/data/no_hits.html | 200 ++++++++++++++++++++++++++++++++++++++++ tests/test_torrent.py | 15 ++- 5 files changed, 240 insertions(+), 42 deletions(-) create mode 100644 tests/data/blocked.html create mode 100644 tests/data/no_hits.html diff --git a/pirate/torrent.py b/pirate/torrent.py index 83b6202..eb62f21 100644 --- a/pirate/torrent.py +++ b/pirate/torrent.py @@ -6,6 +6,8 @@ import urllib.parse as parse import urllib.error import os.path +from pyquery import PyQuery as pq + import pirate.data from pirate.print import print @@ -43,25 +45,6 @@ def parse_sort(sort): return '99' -def parse_magnets_seeds_leechers(found): - res = [] - state = 'seeds' - curr = ['', 0, 0] #magnet, seeds, leeches - for f in found: - if f[1] == '': - curr[0] = f[0] - else: - if state == 'seeds': - curr[1] = f[1] - state = 'leeches' - else: - curr[2] = f[1] - state = 'seeds' - res.append(curr) - curr = ['', 0, 0] - return res - - #TODO: warn users when using a sort in a mode that doesn't accept sorts #TODO: warn users when using search terms in a mode that doesn't accept search terms #TODO: same with page parameter for top and top48h @@ -92,37 +75,40 @@ def build_request_path(page, category, sort, mode, terms): raise Exception('Unknown mode.') -#TODO: redo this with html parser instead of regex -def parse_page(res): - found = re.findall(parser_regex, res) +def parse_page(html): + d = pq(html) + + # first get the magnet links and make sure there are results + magnets = list(map(lambda l: pq(l).attr('href'), + d('table#searchResult tr>td:nth-child(2)>a:nth-child(2)'))) # check for a blocked mirror no_results = re.search(r'No hits\. Try adding an asterisk in ' - r'you search phrase\.', res) - if found == [] and no_results is None: + r'you search phrase\.', html) + if len(magnets) == 0 and no_results is None: # Contradiction - we found no results, # but the page didn't say there were no results. # The page is probably not actually the pirate bay, # so let's try another mirror raise IOError('Blocked mirror detected.') - # get sizes as well and substitute the   character - # TODO: use actual html decode - sizes = [match.replace(' ', ' ').split() - for match in re.findall(r'(?<=Size )[0-9.]' - r'+\ \;[KMGT]*[i ]*B', res)] + # next get more info + seeds = list(map(lambda l: pq(l).text(), + d('table#searchResult tr>td:nth-child(3)'))) + leechers = list(map(lambda l: pq(l).text(), + d('table#searchResult tr>td:nth-child(4)'))) + identifiers = list(map(lambda l: pq(l).attr('href').split('/')[2], + d('table#searchResult .detLink'))) - uploaded = [match.replace(' ', ' ') - for match in re.findall(r'(?<=Uploaded )' - r'.+(?=\, Size)',res)] + sizes = [] + uploaded = [] + # parse descriptions separately + for node in d('font.detDesc'): + text = pq(node).text() + sizes.append(re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', text)[0].split()) + uploaded.append(re.findall(r'(?<=Uploaded ).+(?=\, Size)', text)[0]) - identifiers = [match.replace(' ', ' ') - for match in re.findall('(?<=/torrent/)' - '[0-9]+(?=/)',res)] - - res_l = parse_magnets_seeds_leechers(found) - - return res_l, sizes, uploaded, identifiers + return list(zip(magnets,seeds,leechers)), sizes, uploaded, identifiers def remote(pages, category, sort, mode, terms, mirror): diff --git a/setup.py b/setup.py index 8f8f6c1..42707ef 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup(name='pirate-get', entry_points={ 'console_scripts': ['pirate-get = pirate.pirate:main'] }, - install_requires=['colorama>=0.3.3'], + install_requires=['colorama>=0.3.3', 'pyquery>=1.2.9'], keywords=['torrent', 'magnet', 'download', 'tpb', 'client'], classifiers=[ 'Topic :: Utilities', diff --git a/tests/data/blocked.html b/tests/data/blocked.html new file mode 100644 index 0000000..dd4bc42 --- /dev/null +++ b/tests/data/blocked.html @@ -0,0 +1 @@ +blocked. diff --git a/tests/data/no_hits.html b/tests/data/no_hits.html new file mode 100644 index 0000000..a3a9156 --- /dev/null +++ b/tests/data/no_hits.html @@ -0,0 +1,200 @@ + + + + The Pirate Bay - The galaxy's most resilient bittorrent site + + + + + + + + + + + + + + + + + + + + + + + + +

Search results: aaaaaaaaaaaaaaaaa No hits. Try adding an asterisk in you search phrase.

+ +
+ +
+
+
+
+ + + + + + \ No newline at end of file diff --git a/tests/test_torrent.py b/tests/test_torrent.py index b166359..06c62fd 100755 --- a/tests/test_torrent.py +++ b/tests/test_torrent.py @@ -8,10 +8,21 @@ from tests import util class TestTorrent(unittest.TestCase): - def test_rich_xml(self): + def test_no_hits(self): + res = util.read_data('no_hits.html') + actual = pirate.torrent.parse_page(res) + expected = ([], [], [], []) + self.assertEqual(actual, expected) + + def test_blocked_mirror(self): + res = util.read_data('blocked.html') + with self.assertRaises(IOError): + pirate.torrent.parse_page(res) + + def test_search_results(self): res = util.read_data('dan_bull_search.html') actual = pirate.torrent.parse_page(res) - expected = ([['magnet:?xt=urn:btih:30df4f8b42b8fd77f5e5aa34abbffe97f5e81fbf&dn=Dan+Croll+%26bull%3B+Sweet+Disarray+%5B2014%5D+320&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '16', '1'], ['magnet:?xt=urn:btih:7abd3eda600996b8e6fc9a61b83288e0c6ac0d83&dn=Dan+Bull+-+Massive+Collection&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '4', '0'], ['magnet:?xt=urn:btih:8f8d68fd0a51237c89692c428ed8a8f64a969c70&dn=Dan+Bull+-+Generation+Gaming+-+2013&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '2', '0'], ['magnet:?xt=urn:btih:3da6a0fdc1d67a768cb32597e926abdf3e1a2fdd&dn=Dan+Bull+Collection&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '1', '0'], ['magnet:?xt=urn:btih:5cd371a235317319db7da52c64422f9c2ac75d77&dn=Dan+Bull+-+The+Garden+%7B2014-Album%7D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '1', '0'], ['magnet:?xt=urn:btih:4e14dbd077c920875be4c15971b23b609ad6716a&dn=Dan+Bull+-+Dear+Lily+%5Ban+open+letter+to+Lily+Allen%5D+-+2009%5BMP3+%40&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'], ['magnet:?xt=urn:btih:5d9319cf852f7462422cb1bffc37b65174645047&dn=Dan+Bull+-+Dear+Mandy+%5Ban+open+letter+to+Lord+Mandelson%5D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'], ['magnet:?xt=urn:btih:1c54af57426f53fdef4bbf1a9dbddf32f7b4988a&dn=Dan+Bull+-+Dear+Lily+%28Lily+Allen%29+%28Song+about+filesharing%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'], ['magnet:?xt=urn:btih:942c5bf3e1e9bc263939e13cea6ad7bd5f62aa36&dn=Dan+Bull+-+SOPA+Cabana.mp3&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'], ['magnet:?xt=urn:btih:d376f68a31b0db652234e790ed7256ac5e32db57&dn=Dan+Bull+-+SOPA+Cabana&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'], ['magnet:?xt=urn:btih:28163770a532eb24b9e0865878288a9bbdb7a5e6&dn=Dan+Bull+-+SOPA+Cabana+%5BWORKING%5D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'], ['magnet:?xt=urn:btih:779ab0f13a3fbb12ba68b27721491e4d143f26eb&dn=Dan+Bull+-+Bye+Bye+BPI+2012++%5BMP3%40192%5D%28oan%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'], ['magnet:?xt=urn:btih:2667e4795bd5c868dedcabcb52943f4bb7212bab&dn=Dan+Bull+-+Dishonored+%5BExplicit+ver.%5D+%28Single+2012%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'], ['magnet:?xt=urn:btih:16364f83c556ad0fd3bb57a4a7c890e7e8087414&dn=Halo+4+EPIC+Rap+By+Dan+Bull&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'], ['magnet:?xt=urn:btih:843b466d9fd1f0bee3a476573b272dc2d6d0ebae&dn=Dan+Bull+-+Generation+Gaming+-+2013&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1']], [['89.33', 'MiB'], ['294', 'MiB'], ['54.86', 'MiB'], ['236.78', 'MiB'], ['36.27', 'MiB'], ['5.51', 'MiB'], ['5.07', 'MiB'], ['5.34', 'MiB'], ['4.8', 'MiB'], ['3.4', 'MiB'], ['4.8', 'MiB'], ['60.72', 'MiB'], ['6.29', 'MiB'], ['6.41', 'MiB'], ['54.87', 'MiB']], ['04-04 2014', '03-02 2014', '01-19 2013', '01-21 2010', '09-02 2014', '09-27 2009', '11-29 2009', '11-10 2011', '12-20 2011', '12-21 2011', '12-21 2011', '03-09 2012', '10-24 2012', '11-10 2012', '01-19 2013'], ['9890864', '9684858', '8037968', '5295449', '10954408', '5101630', '5185893', '6806996', '6901871', '6902247', '6903548', '7088979', '7756344', '7812951', '8037899']) + expected = ([('magnet:?xt=urn:btih:30df4f8b42b8fd77f5e5aa34abbffe97f5e81fbf&dn=Dan+Croll+%26bull%3B+Sweet+Disarray+%5B2014%5D+320&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '16', '1'), ('magnet:?xt=urn:btih:7abd3eda600996b8e6fc9a61b83288e0c6ac0d83&dn=Dan+Bull+-+Massive+Collection&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '4', '0'), ('magnet:?xt=urn:btih:8f8d68fd0a51237c89692c428ed8a8f64a969c70&dn=Dan+Bull+-+Generation+Gaming+-+2013&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '2', '0'), ('magnet:?xt=urn:btih:3da6a0fdc1d67a768cb32597e926abdf3e1a2fdd&dn=Dan+Bull+Collection&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '1', '0'), ('magnet:?xt=urn:btih:5cd371a235317319db7da52c64422f9c2ac75d77&dn=Dan+Bull+-+The+Garden+%7B2014-Album%7D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '1', '0'), ('magnet:?xt=urn:btih:4e14dbd077c920875be4c15971b23b609ad6716a&dn=Dan+Bull+-+Dear+Lily+%5Ban+open+letter+to+Lily+Allen%5D+-+2009%5BMP3+%40&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'), ('magnet:?xt=urn:btih:5d9319cf852f7462422cb1bffc37b65174645047&dn=Dan+Bull+-+Dear+Mandy+%5Ban+open+letter+to+Lord+Mandelson%5D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'), ('magnet:?xt=urn:btih:1c54af57426f53fdef4bbf1a9dbddf32f7b4988a&dn=Dan+Bull+-+Dear+Lily+%28Lily+Allen%29+%28Song+about+filesharing%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'), ('magnet:?xt=urn:btih:942c5bf3e1e9bc263939e13cea6ad7bd5f62aa36&dn=Dan+Bull+-+SOPA+Cabana.mp3&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'), ('magnet:?xt=urn:btih:d376f68a31b0db652234e790ed7256ac5e32db57&dn=Dan+Bull+-+SOPA+Cabana&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'), ('magnet:?xt=urn:btih:28163770a532eb24b9e0865878288a9bbdb7a5e6&dn=Dan+Bull+-+SOPA+Cabana+%5BWORKING%5D&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'), ('magnet:?xt=urn:btih:779ab0f13a3fbb12ba68b27721491e4d143f26eb&dn=Dan+Bull+-+Bye+Bye+BPI+2012++%5BMP3%40192%5D%28oan%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1'), ('magnet:?xt=urn:btih:2667e4795bd5c868dedcabcb52943f4bb7212bab&dn=Dan+Bull+-+Dishonored+%5BExplicit+ver.%5D+%28Single+2012%29&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'), ('magnet:?xt=urn:btih:16364f83c556ad0fd3bb57a4a7c890e7e8087414&dn=Halo+4+EPIC+Rap+By+Dan+Bull&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '0'), ('magnet:?xt=urn:btih:843b466d9fd1f0bee3a476573b272dc2d6d0ebae&dn=Dan+Bull+-+Generation+Gaming+-+2013&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969', '0', '1')], [['89.33', 'MiB'], ['294', 'MiB'], ['54.86', 'MiB'], ['236.78', 'MiB'], ['36.27', 'MiB'], ['5.51', 'MiB'], ['5.07', 'MiB'], ['5.34', 'MiB'], ['4.8', 'MiB'], ['3.4', 'MiB'], ['4.8', 'MiB'], ['60.72', 'MiB'], ['6.29', 'MiB'], ['6.41', 'MiB'], ['54.87', 'MiB']],['04-04\xa02014', '03-02\xa02014', '01-19\xa02013', '01-21\xa02010', '09-02\xa02014', '09-27\xa02009', '11-29\xa02009', '11-10\xa02011', '12-20\xa02011', '12-21\xa02011', '12-21\xa02011', '03-09\xa02012', '10-24\xa02012', '11-10\xa02012', '01-19\xa02013'], ['9890864', '9684858', '8037968', '5295449', '10954408', '5101630', '5185893', '6806996', '6901871', '6902247', '6903548', '7088979', '7756344', '7812951', '8037899']) self.assertEqual(actual, expected) if __name__ == '__main__':