From c23c3db3d808bbfda4db968fee2f92f50116a138 Mon Sep 17 00:00:00 2001 From: rnhmjoj Date: Thu, 21 May 2020 01:57:46 +0200 Subject: [PATCH] initial work on tpb api support --- pirate/data.py | 5 +- pirate/pirate.py | 13 ++- pirate/print.py | 96 ++++++++++---------- pirate/torrent.py | 220 ++++++++++++++++------------------------------ 4 files changed, 126 insertions(+), 208 deletions(-) diff --git a/pirate/data.py b/pirate/data.py index dd3fe1f..192bd01 100644 --- a/pirate/data.py +++ b/pirate/data.py @@ -5,7 +5,8 @@ import pkgutil def get_resource(filename): return pkgutil.get_data(__package__, 'data/' + filename) -version = '0.3.7' + +version = '0.4.0' categories = json.loads(get_resource('categories.json').decode()) sorts = json.loads(get_resource('sorts.json').decode()) @@ -14,5 +15,5 @@ blacklist = set(json.loads(get_resource('blacklist.json').decode())) default_headers = {'User-Agent': 'pirate get'} default_timeout = 10 -default_mirror = 'https://thepiratebay.org/' +default_mirror = 'https://apibay.org' mirror_list = 'https://proxybay.bz/list.txt' diff --git a/pirate/pirate.py b/pirate/pirate.py index 59c8824..b27742b 100755 --- a/pirate/pirate.py +++ b/pirate/pirate.py @@ -144,9 +144,6 @@ def parse_args(args_in): help='a csv file containing the Pirate Bay database ' 'downloaded from ' 'https://thepiratebay.org/static/dump/csv/') - parser.add_argument('-p', dest='pages', default=1, type=int, - help='the number of pages to fetch ' - "(doesn't work with --local)") parser.add_argument('-0', dest='first', action='store_true', help='choose the top result') @@ -261,14 +258,14 @@ def combine_configs(config, args): def connect_mirror(mirror, printer, args): try: printer.print('Trying', mirror, end='... 
') + url = pirate.torrent.find_api(mirror) results = pirate.torrent.remote( printer=printer, - pages=args.pages, category=pirate.torrent.parse_category(printer, args.category), sort=pirate.torrent.parse_sort(printer, args.sort), mode=args.action, terms=args.search, - mirror=mirror) + mirror=url) except (urllib.error.URLError, socket.timeout, IOError, ValueError) as e: printer.print('Failed', color='WARN', end=' ') printer.print('(', e, ')', sep='') @@ -380,13 +377,13 @@ def pirate_main(args): printer.print("\nSelect links (Type 'h' for more options" ", 'q' to quit)", end='\b', color='alt') try: - l = builtins.input(': ') + cmd = builtins.input(': ') except (KeyboardInterrupt, EOFError): printer.print('\nCancelled.') return try: - code, choices = parse_torrent_command(l) + code, choices = parse_torrent_command(cmd) # Act on option, if supplied printer.print('') if code == 'h': @@ -416,7 +413,7 @@ def pirate_main(args): elif code == 't': pirate.torrent.save_torrents(printer, choices, results, args.save_directory) - elif not l: + elif not cmd: printer.print('No links entered!', color='WARN') else: break diff --git a/pirate/print.py b/pirate/print.py index c7490c2..98b2b02 100644 --- a/pirate/print.py +++ b/pirate/print.py @@ -1,17 +1,17 @@ import builtins import re import gzip -import urllib.parse as parse import urllib.request as request import shutil +import json import pirate.data +import pirate.torrent import colorama -import veryprettytable +from veryprettytable import VeryPrettyTable from io import BytesIO -from http.cookiejar import CookieJar class Printer: @@ -45,14 +45,14 @@ class Printer: even = True if local: - table = veryprettytable.VeryPrettyTable(['LINK', 'DATE', 'SIZE', 'NAME']) + table = VeryPrettyTable(['LINK', 'DATE', 'SIZE', 'NAME']) table.align['SIZE'] = 'r' table.align['NAME'] = 'l' else: - table = veryprettytable.VeryPrettyTable(['LINK', 'SEED', 'LEECH', - 'RATIO', 'SIZE', - 'UPLOAD', 'NAME']) + table = VeryPrettyTable(['LINK', 'SEED', 'LEECH', 
+ 'RATIO', 'SIZE', + 'UPLOAD', 'NAME']) table.align['NAME'] = 'l' table.align['SEED'] = 'r' table.align['LEECH'] = 'r' @@ -65,21 +65,15 @@ class Printer: table.padding_width = 1 for n, result in enumerate(results): - - name = re.search(r'dn=([^\&]*)', result['magnet']) - torrent_name = parse.unquote_plus(name.group(1)) + torrent_name = result['name'] if local: - content = [n, result['date'], result['size'], torrent_name[:columns - 42]] + content = [n, result['date'], result['size'], + torrent_name[:columns - 42]] else: - no_seeders = int(result['seeds']) + no_seeders = int(result['seeders']) no_leechers = int(result['leechers']) - if result['size'] != []: - size = float(result['size'][0]) - unit = result['size'][1] - else: - size = 0 - unit = '???' + size = result['size'] date = result['uploaded'] # compute the S/L ratio (Higher is better) @@ -90,8 +84,7 @@ class Printer: content = [n, no_seeders, no_leechers, '{:.1f}'.format(ratio), - '{:.1f} '.format(size) + unit, - date, torrent_name[:columns - 50]] + size, date, torrent_name[:columns - 50]] if even or not self.enable_color: table.add_row(content) @@ -103,64 +96,63 @@ class Printer: self.print(table) def descriptions(self, chosen_links, results, site): - jar = CookieJar() - opener = request.build_opener( - request.HTTPErrorProcessor, - request.HTTPCookieProcessor(jar)) + opener = request.build_opener(request.HTTPErrorProcessor) for link in chosen_links: - path = '/torrent/%s/' % results[link]['id'] - req = request.Request(site + path, - headers=pirate.data.default_headers) + result = results[link] + req = request.Request( + site + '/t.php?id=' + result['id'], + headers=pirate.data.default_headers) req.add_header('Accept-encoding', 'gzip') f = opener.open(req, timeout=pirate.data.default_timeout) if f.info().get('Content-Encoding') == 'gzip': f = gzip.GzipFile(fileobj=BytesIO(f.read())) - res = f.read().decode('utf-8') - name = re.search(r'dn=([^\&]*)', results[link]['magnet']) - torrent_name = 
parse.unquote(name.group(1)).replace('+', ' ') - desc = re.search(r'
\s*
(.+?)(?=
)', - res, re.DOTALL).group(1) + res = json.load(f) # Replace HTML links with markdown style versions desc = re.sub(r']*>(\s*)([^<]+?)(\s*' - r')', r'\2[\3](\1)\4', desc) + r')', r'\2[\3](\1)\4', res['descr']) - self.print('Description for "%s":' % torrent_name, color='zebra_1') + self.print('Description for "{}":'.format(result['name']), + color='zebra_1') self.print(desc, color='zebra_0') def file_lists(self, chosen_links, results, site): - jar = CookieJar() - opener = request.build_opener( - request.HTTPErrorProcessor, - request.HTTPCookieProcessor(jar)) + opener = request.build_opener(request.HTTPErrorProcessor) + + # the API may return an object instead of a list + def get(obj): + try: + return obj[0] + except KeyError: + return obj['0'] for link in chosen_links: - path = '/ajax_details_filelist.php' - query = '?id=' + results[link]['id'] - req = request.Request(site + path + query, - headers=pirate.data.default_headers) + result = results[link] + req = request.Request( + site + '/f.php?id=' + result['id'], + headers=pirate.data.default_headers) req.add_header('Accept-encoding', 'gzip') f = opener.open(req, timeout=pirate.data.default_timeout) if f.info().get('Content-Encoding') == 'gzip': f = gzip.GzipFile(fileobj=BytesIO(f.read())) - # TODO: proper html decoding/parsing - res = f.read().decode('utf-8').replace(' ', ' ') - if 'File list not available.' 
in res: + res = json.load(f) + + if len(res) == 1 and 'not found' in get(res[0]['name']): self.print('File list not available.') return - files = re.findall(r'\s*([^<]+?)\s*\s*([^<]+?)\s*', res) - name = re.search(r'dn=([^\&]*)', results[link]['magnet']) - torrent_name = parse.unquote(name.group(1)).replace('+', ' ') - self.print('Files in "%s":' % torrent_name, color='zebra_1') + self.print('Files in {}:'.format(result['name']), color='zebra_1') cur_color = 'zebra_0' - for f in files: - self.print('{0[0]:>11} {0[1]}'.format(f), color=cur_color) + for f in res: + name = get(f['name']) + size = pirate.torrent.pretty_size(int(get(f['size']))) + self.print('{:>11} {}'.format( + size, name), + color=cur_color) cur_color = 'zebra_0' if cur_color == 'zebra_1' else 'zebra_1' diff --git a/pirate/torrent.py b/pirate/torrent.py index defccbc..c2b4468 100644 --- a/pirate/torrent.py +++ b/pirate/torrent.py @@ -8,13 +8,10 @@ import urllib.error import os.path import pirate.data +import json -from bs4 import BeautifulSoup +from datetime import datetime from io import BytesIO -from http.cookiejar import CookieJar - - -parser_regex = r'"(magnet\:\?xt=[^"]*)|([^<]+)' def parse_category(printer, category): @@ -45,144 +42,82 @@ def parse_sort(printer, sort): return 99 -# TODO: -# * warn users when using a sort in a mode that doesn't accept sorts -# * warn users when using search terms in a mode -# that doesn't accept search terms -# * same with page parameter for top and top48h -# * warn the user if trying to use a minor category with top48h -def build_request_path(page, category, sort, mode, terms): - if mode == 'browse': - if(category == 0): - category = 100 - return '/browse/{}/{}/{}'.format(category, page, sort) - elif mode == 'recent': - # This is not a typo. There is no / between 48h and the category. 
- path = '/top/48h' - # only major categories can be used with this mode - if(category == 0): - return path + 'all' - else: - return path + str(category) - elif mode == 'top': - path = '/top/' - if(category == 0): - return path + 'all' - else: - return path + str(category) - elif mode == 'search': - query = urllib.parse.quote_plus(' '.join(terms)) - return '/search/{}/{}/{}/{}'.format(query, page, sort, category) - else: - raise Exception('Unknown mode.') +def pretty_size(size): + ranges = [('PiB', 1125899906842624), + ('TiB', 1099511627776), + ('GiB', 1073741824), + ('MiB', 1048576), + ('KiB', 1024)] + for unit, value in ranges: + if size >= value: + return '{:.1f} {}'.format(size/value, unit) + return str(size) + ' B' -# this returns a list of dictionaries -def parse_page(html): - soup = BeautifulSoup(html, 'html.parser') - tables = soup.find_all('table', id='searchResult') - no_results = re.search(r'No hits\. Try adding an asterisk in ' - r'you search phrase\.', html) - - # check for a blocked mirror - if not tables and not no_results: - # Contradiction - we found no results, - # but the page didn't say there were no results. 
- # The page is probably not actually the pirate bay, - # so let's try another mirror - raise IOError('Blocked mirror detected.') - - if no_results: - return [] - - # handle ads disguised as fake result tables - for table in tables: - results = parse_table(table) - if results: - break - else: - raise IOError('Mirror does not contain magnets.') - - return results +def pretty_date(ts): + date = datetime.fromtimestamp(int(ts)) + return date.strftime('%Y-%m-%d %H:%M') -def parse_table(table): +def make_magnet(name, info_hash): + return 'magnet:?xt=urn:btih:{}&dn={}'.format( + info_hash, parse.quote(name, '')) + + +def remote(printer, category, sort, mode, terms, mirror): results = [] - # parse the rows one by one (skipping headings) - for row in table('tr')[1:]: - # grab info about the row - row_link = row.find('a', class_='detLink') - if row_link is None: - continue - - id_ = row_link['href'].split('/')[2] - seeds, leechers = [i.text for i in row('td')[-2:]] - magnet_tag = row.find(lambda tag: tag.name == 'a' and - tag['href'].startswith('magnet')) - if magnet_tag is None: - continue - magnet = magnet_tag['href'] - - # parse descriptions separately - description = row.find('font', class_='detDesc').text - size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B', - description)[0].split() - uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)', - description)[0] - - results.append({ - 'magnet': magnet, - 'seeds': seeds, - 'leechers': leechers, - 'size': size, - 'uploaded': uploaded, - 'id': id_ - }) - - return results - - -def remote(printer, pages, category, sort, mode, terms, mirror): - res_l = [] - - if pages < 1: - raise ValueError('Please provide an integer greater than 0 ' - 'for the number of pages to fetch.') - # Catch the Ctrl-C exception and exit cleanly try: - jar = CookieJar() - opener = request.build_opener( - request.HTTPErrorProcessor, - request.HTTPCookieProcessor(jar)) + req = request.Request( + '{}/q.php?q={}&cat={}'.format( + mirror, ' '.join(terms), 
category), + headers=pirate.data.default_headers) + try: + f = request.urlopen(req, timeout=pirate.data.default_timeout) + except urllib.error.URLError as e: + raise e - for page in range(pages): - path = build_request_path(page, category, sort, mode, terms) + if f.info().get('Content-Encoding') == 'gzip': + f = gzip.GzipFile(fileobj=BytesIO(f.read())) + for res in json.load(f): + res['size'] = pretty_size(int(res['size'])) + res['magnet'] = make_magnet(res['name'], res['info_hash']) + res['info_hash'] = int(res['info_hash'], 16) + res['uploaded'] = pretty_date(res['added']) + results.append(res) - req = request.Request(mirror + path, - headers=pirate.data.default_headers) - req.add_header('Accept-encoding', 'gzip') - - try: - f = opener.open(req, timeout=pirate.data.default_timeout) - except urllib.error.URLError as e: - res = e.fp.read().decode() - if e.code == 503 and 'cf-browser-verification' in res: - raise IOError('Cloudflare protected') - raise e - - if f.info().get('Content-Encoding') == 'gzip': - f = gzip.GzipFile(fileobj=BytesIO(f.read())) - res = f.read().decode('utf-8') - - res_l += parse_page(res) + return results except KeyboardInterrupt: printer.print('\nCancelled.') sys.exit(0) - return res_l + +def find_api(mirror): + # try common paths + for path in ['', '/apip', '/api.php?url=']: + req = request.Request(mirror + path + '/q.php?q=test&cat=0', + headers=pirate.data.default_headers) + try: + f = request.urlopen(req, timeout=pirate.data.default_timeout) + if f.info().get_content_type() == 'application/json': + return mirror + path + except urllib.error.URLError: + pass + + # extract api path from main.js + req = request.Request(mirror + '/static/main.js', + headers=pirate.data.default_headers) + try: + f = request.urlopen(req, timeout=pirate.data.default_timeout) + if f.info().get_content_type() == 'application/javascript': + match = re.search("var server='([^']+)'", f.read().decode()) + return mirror + match.group(1) + except urllib.error.URLError: 
+ raise IOError('API not found: no main.js') + + raise IOError('API not found') def get_torrent(info_hash): @@ -200,44 +135,37 @@ def get_torrent(info_hash): def save_torrents(printer, chosen_links, results, folder): for link in chosen_links: - magnet = results[link]['magnet'] - name = re.search(r'dn=([^\&]*)', magnet) - torrent_name = parse.unquote(name.group(1)).replace('+', ' ') - info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16) - torrent_name = torrent_name.replace('/', '_').replace('\\', '_') + result = results[link] + torrent_name = result['name'].replace('/', '_').replace('\\', '_') file = os.path.join(folder, torrent_name + '.torrent') try: - torrent = get_torrent(info_hash) + torrent = get_torrent(result['info_hash']) except urllib.error.HTTPError as e: printer.print('There is no cached file for this torrent :(' ' \nCode: {} - {}'.format(e.code, e.reason), color='ERROR') else: open(file, 'wb').write(torrent) - printer.print('Saved {:X} in {}'.format(info_hash, file)) + printer.print('Saved {:X} in {}'.format(result['info_hash'], file)) def save_magnets(printer, chosen_links, results, folder): for link in chosen_links: - magnet = results[link]['magnet'] - name = re.search(r'dn=([^\&]*)', magnet) - torrent_name = parse.unquote(name.group(1)).replace('+', ' ') - info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16) - torrent_name = torrent_name.replace('/', '_').replace('\\', '_') + result = results[link] + torrent_name = result['name'].replace('/', '_').replace('\\', '_') file = os.path.join(folder, torrent_name + '.magnet') - printer.print('Saved {:X} in {}'.format(info_hash, file)) + printer.print('Saved {:X} in {}'.format(result['info_hash'], file)) with open(file, 'w') as f: - f.write(magnet + '\n') + f.write(result['magnet'] + '\n') def copy_magnets(printer, chosen_links, results): clipboard_text = '' for link in chosen_links: - magnet = results[link]['magnet'] - info_hash = 
int(re.search(r'btih:([a-fA-F0-9]{40})', magnet).group(1), 16) - clipboard_text += magnet + "\n" - printer.print('Copying {:X} to clipboard'.format(info_hash)) + result = results[link] + clipboard_text += result['magnet'] + "\n" + printer.print('Copying {:X} to clipboard'.format(result['info_hash'])) pyperclip.copy(clipboard_text)