mirror of https://github.com/vikstrous/pirate-get synced 2025-01-10 10:04:21 +01:00

initial work on tpb api support

Michele Guerini Rocco 2020-05-21 01:57:46 +02:00
parent 567ea9db11
commit c23c3db3d8
Signed by: rnhmjoj
GPG Key ID: BFBAF4C975F76450
4 changed files with 126 additions and 208 deletions

pirate/data.py

@@ -5,7 +5,8 @@ import pkgutil
 def get_resource(filename):
     return pkgutil.get_data(__package__, 'data/' + filename)
 
-version = '0.3.7'
+version = '0.4.0'
 
 categories = json.loads(get_resource('categories.json').decode())
 sorts = json.loads(get_resource('sorts.json').decode())
@@ -14,5 +15,5 @@ blacklist = set(json.loads(get_resource('blacklist.json').decode()))
 default_headers = {'User-Agent': 'pirate get'}
 default_timeout = 10
-default_mirror = 'https://thepiratebay.org/'
+default_mirror = 'https://apibay.org'
 mirror_list = 'https://proxybay.bz/list.txt'

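Note: the new default mirror serves the TPB JSON API instead of HTML. A minimal sketch of what a query against it looks like, using the q.php endpoint and result fields that this commit consumes in pirate/torrent.py (the helper itself is illustrative, not part of the codebase):

    import json
    import urllib.parse
    import urllib.request

    # illustrative helper, not part of this commit
    def search_apibay(terms, category=0, mirror='https://apibay.org'):
        # q.php takes the search string and a numeric category and
        # returns a JSON list of flat result objects
        url = '{}/q.php?q={}&cat={}'.format(
            mirror, urllib.parse.quote_plus(' '.join(terms)), category)
        req = urllib.request.Request(
            url, headers={'User-Agent': 'pirate get'})
        with urllib.request.urlopen(req, timeout=10) as f:
            return json.load(f)

    # each result carries 'name', 'info_hash', 'seeders', 'leechers',
    # 'size' (bytes) and 'added' (unix timestamp), as used below
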
pirate/pirate.py

@@ -144,9 +144,6 @@ def parse_args(args_in):
                         help='a csv file containing the Pirate Bay database '
                              'downloaded from '
                              'https://thepiratebay.org/static/dump/csv/')
-    parser.add_argument('-p', dest='pages', default=1, type=int,
-                        help='the number of pages to fetch '
-                             "(doesn't work with --local)")
     parser.add_argument('-0', dest='first',
                         action='store_true',
                         help='choose the top result')
@@ -261,14 +258,14 @@ def combine_configs(config, args):
 def connect_mirror(mirror, printer, args):
     try:
         printer.print('Trying', mirror, end='... ')
+        url = pirate.torrent.find_api(mirror)
         results = pirate.torrent.remote(
             printer=printer,
-            pages=args.pages,
             category=pirate.torrent.parse_category(printer, args.category),
             sort=pirate.torrent.parse_sort(printer, args.sort),
             mode=args.action,
             terms=args.search,
-            mirror=mirror)
+            mirror=url)
     except (urllib.error.URLError, socket.timeout, IOError, ValueError) as e:
         printer.print('Failed', color='WARN', end=' ')
         printer.print('(', e, ')', sep='')
@@ -380,13 +377,13 @@ def pirate_main(args):
         printer.print("\nSelect links (Type 'h' for more options"
                       ", 'q' to quit)", end='\b', color='alt')
         try:
-            l = builtins.input(': ')
+            cmd = builtins.input(': ')
         except (KeyboardInterrupt, EOFError):
             printer.print('\nCancelled.')
             return
 
         try:
-            code, choices = parse_torrent_command(l)
+            code, choices = parse_torrent_command(cmd)
             # Act on option, if supplied
             printer.print('')
             if code == 'h':
@@ -416,7 +413,7 @@ def pirate_main(args):
             elif code == 't':
                 pirate.torrent.save_torrents(printer, choices, results,
                                              args.save_directory)
-            elif not l:
+            elif not cmd:
                 printer.print('No links entered!', color='WARN')
             else:
                 break

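Note: connecting to a mirror is now a two-step flow: find_api() probes the mirror for the base URL of its JSON API, then remote() queries that base. A hedged sketch using the signatures from this commit (category 0 and sort 99 are the fall-through defaults returned by parse_category and parse_sort):

    import pirate.torrent

    # sketch of the new two-step flow; error handling trimmed
    def search_mirror(mirror, printer, terms):
        url = pirate.torrent.find_api(mirror)  # raises IOError if no API
        return pirate.torrent.remote(
            printer=printer,
            category=0,    # all categories
            sort=99,       # default sort
            mode='search',
            terms=terms,
            mirror=url)
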
pirate/print.py

@@ -1,17 +1,17 @@
 import builtins
 import re
 import gzip
-import urllib.parse as parse
 import urllib.request as request
 import shutil
+import json
 
 import pirate.data
+import pirate.torrent
 
 import colorama
-import veryprettytable
+from veryprettytable import VeryPrettyTable
 
 from io import BytesIO
-from http.cookiejar import CookieJar
 
 
 class Printer:
@@ -45,12 +45,12 @@ class Printer:
         even = True
 
         if local:
-            table = veryprettytable.VeryPrettyTable(['LINK', 'DATE', 'SIZE', 'NAME'])
+            table = VeryPrettyTable(['LINK', 'DATE', 'SIZE', 'NAME'])
             table.align['SIZE'] = 'r'
             table.align['NAME'] = 'l'
         else:
-            table = veryprettytable.VeryPrettyTable(['LINK', 'SEED', 'LEECH',
-                                                     'RATIO', 'SIZE',
-                                                     'UPLOAD', 'NAME'])
+            table = VeryPrettyTable(['LINK', 'SEED', 'LEECH',
+                                     'RATIO', 'SIZE',
+                                     'UPLOAD', 'NAME'])
             table.align['NAME'] = 'l'
@@ -65,21 +65,15 @@ class Printer:
         table.padding_width = 1
 
         for n, result in enumerate(results):
-            name = re.search(r'dn=([^\&]*)', result['magnet'])
-            torrent_name = parse.unquote_plus(name.group(1))
+            torrent_name = result['name']
 
             if local:
-                content = [n, result['date'], result['size'], torrent_name[:columns - 42]]
+                content = [n, result['date'], result['size'],
+                           torrent_name[:columns - 42]]
             else:
-                no_seeders = int(result['seeds'])
+                no_seeders = int(result['seeders'])
                 no_leechers = int(result['leechers'])
-                if result['size'] != []:
-                    size = float(result['size'][0])
-                    unit = result['size'][1]
-                else:
-                    size = 0
-                    unit = '???'
+                size = result['size']
                 date = result['uploaded']
 
             # compute the S/L ratio (Higher is better)
@@ -90,8 +84,7 @@ class Printer:
                 content = [n, no_seeders, no_leechers,
                            '{:.1f}'.format(ratio),
-                           '{:.1f} '.format(size) + unit,
-                           date, torrent_name[:columns - 50]]
+                           size, date, torrent_name[:columns - 50]]
 
             if even or not self.enable_color:
                 table.add_row(content)
@@ -103,14 +96,12 @@ class Printer:
         self.print(table)
 
     def descriptions(self, chosen_links, results, site):
-        jar = CookieJar()
-        opener = request.build_opener(
-            request.HTTPErrorProcessor,
-            request.HTTPCookieProcessor(jar))
+        opener = request.build_opener(request.HTTPErrorProcessor)
 
         for link in chosen_links:
-            path = '/torrent/%s/' % results[link]['id']
-            req = request.Request(site + path,
-                                  headers=pirate.data.default_headers)
+            result = results[link]
+            req = request.Request(
+                site + '/t.php?id=' + result['id'],
+                headers=pirate.data.default_headers)
             req.add_header('Accept-encoding', 'gzip')
             f = opener.open(req, timeout=pirate.data.default_timeout)
@@ -118,29 +109,30 @@ class Printer:
             if f.info().get('Content-Encoding') == 'gzip':
                 f = gzip.GzipFile(fileobj=BytesIO(f.read()))
 
-            res = f.read().decode('utf-8')
-            name = re.search(r'dn=([^\&]*)', results[link]['magnet'])
-            torrent_name = parse.unquote(name.group(1)).replace('+', ' ')
-            desc = re.search(r'<div class="nfo">\s*<pre>(.+?)(?=</pre>)',
-                             res, re.DOTALL).group(1)
+            res = json.load(f)
 
             # Replace HTML links with markdown style versions
             desc = re.sub(r'<a href="\s*([^"]+?)\s*"[^>]*>(\s*)([^<]+?)(\s*'
-                          r')</a>', r'\2[\3](\1)\4', desc)
+                          r')</a>', r'\2[\3](\1)\4', res['descr'])
 
-            self.print('Description for "%s":' % torrent_name, color='zebra_1')
+            self.print('Description for "{}":'.format(result['name']),
+                       color='zebra_1')
             self.print(desc, color='zebra_0')
 
     def file_lists(self, chosen_links, results, site):
-        jar = CookieJar()
-        opener = request.build_opener(
-            request.HTTPErrorProcessor,
-            request.HTTPCookieProcessor(jar))
+        opener = request.build_opener(request.HTTPErrorProcessor)
+
+        # the API may return an object instead of a list
+        def get(obj):
+            try:
+                return obj[0]
+            except KeyError:
+                return obj['0']
 
         for link in chosen_links:
-            path = '/ajax_details_filelist.php'
-            query = '?id=' + results[link]['id']
-            req = request.Request(site + path + query,
-                                  headers=pirate.data.default_headers)
+            result = results[link]
+            req = request.Request(
+                site + '/f.php?id=' + result['id'],
+                headers=pirate.data.default_headers)
             req.add_header('Accept-encoding', 'gzip')
             f = opener.open(req, timeout=pirate.data.default_timeout)
@@ -148,19 +140,19 @@ class Printer:
             if f.info().get('Content-Encoding') == 'gzip':
                 f = gzip.GzipFile(fileobj=BytesIO(f.read()))
 
-            # TODO: proper html decoding/parsing
-            res = f.read().decode('utf-8').replace('&nbsp;', ' ')
-            if 'File list not available.' in res:
+            res = json.load(f)
+
+            if len(res) == 1 and 'not found' in get(res[0]['name']):
                 self.print('File list not available.')
                 return
-            files = re.findall(r'<td align="left">\s*([^<]+?)\s*</td><td ali'
-                               r'gn="right">\s*([^<]+?)\s*</tr>', res)
-            name = re.search(r'dn=([^\&]*)', results[link]['magnet'])
-            torrent_name = parse.unquote(name.group(1)).replace('+', ' ')
 
-            self.print('Files in "%s":' % torrent_name, color='zebra_1')
+            self.print('Files in {}:'.format(result['name']), color='zebra_1')
             cur_color = 'zebra_0'
-            for f in files:
-                self.print('{0[0]:>11} {0[1]}'.format(f), color=cur_color)
+            for f in res:
+                name = get(f['name'])
+                size = pirate.torrent.pretty_size(int(get(f['size'])))
+                self.print('{:>11} {}'.format(
+                           size, name),
+                           color=cur_color)
                 cur_color = 'zebra_0' if cur_color == 'zebra_1' else 'zebra_1'

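Note: the get() helper above exists because the file-list endpoint (f.php) is inconsistent about types: the name and size fields arrive either as single-element lists or as objects keyed by the string '0'. A small illustration of both shapes (the sample values are made up):

    def get(obj):
        try:
            return obj[0]       # list shape
        except KeyError:
            return obj['0']     # object shape

    list_shape = {'name': ['ubuntu.iso'], 'size': [3654957056]}
    object_shape = {'name': {'0': 'ubuntu.iso'}, 'size': {'0': 3654957056}}

    assert get(list_shape['name']) == get(object_shape['name'])
    assert get(list_shape['size']) == get(object_shape['size'])
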
pirate/torrent.py

@@ -8,13 +8,10 @@ import urllib.error
 import os.path
 
 import pirate.data
+import json
 
-from bs4 import BeautifulSoup
+from datetime import datetime
 from io import BytesIO
-from http.cookiejar import CookieJar
-
-parser_regex = r'"(magnet\:\?xt=[^"]*)|<td align="right">([^<]+)</td>'
 
 
 def parse_category(printer, category):
@@ -45,144 +42,82 @@ def parse_sort(printer, sort):
     return 99
 
 
-# TODO:
-# * warn users when using a sort in a mode that doesn't accept sorts
-# * warn users when using search terms in a mode
-#   that doesn't accept search terms
-# * same with page parameter for top and top48h
-# * warn the user if trying to use a minor category with top48h
-
-
-def build_request_path(page, category, sort, mode, terms):
-    if mode == 'browse':
-        if(category == 0):
-            category = 100
-        return '/browse/{}/{}/{}'.format(category, page, sort)
-    elif mode == 'recent':
-        # This is not a typo. There is no / between 48h and the category.
-        path = '/top/48h'
-        # only major categories can be used with this mode
-        if(category == 0):
-            return path + 'all'
-        else:
-            return path + str(category)
-    elif mode == 'top':
-        path = '/top/'
-        if(category == 0):
-            return path + 'all'
-        else:
-            return path + str(category)
-    elif mode == 'search':
-        query = urllib.parse.quote_plus(' '.join(terms))
-        return '/search/{}/{}/{}/{}'.format(query, page, sort, category)
-    else:
-        raise Exception('Unknown mode.')
-
-
-# this returns a list of dictionaries
-def parse_page(html):
-    soup = BeautifulSoup(html, 'html.parser')
-    tables = soup.find_all('table', id='searchResult')
-    no_results = re.search(r'No hits\. Try adding an asterisk in '
-                           r'you search phrase\.', html)
-
-    # check for a blocked mirror
-    if not tables and not no_results:
-        # Contradiction - we found no results,
-        # but the page didn't say there were no results.
-        # The page is probably not actually the pirate bay,
-        # so let's try another mirror
-        raise IOError('Blocked mirror detected.')
-
-    if no_results:
-        return []
-
-    # handle ads disguised as fake result tables
-    for table in tables:
-        results = parse_table(table)
-        if results:
-            break
-    else:
-        raise IOError('Mirror does not contain magnets.')
-
-    return results
-
-
-def parse_table(table):
-    results = []
-
-    # parse the rows one by one (skipping headings)
-    for row in table('tr')[1:]:
-        # grab info about the row
-        row_link = row.find('a', class_='detLink')
-        if row_link is None:
-            continue
-
-        id_ = row_link['href'].split('/')[2]
-        seeds, leechers = [i.text for i in row('td')[-2:]]
-        magnet_tag = row.find(lambda tag: tag.name == 'a' and
-                              tag['href'].startswith('magnet'))
-        if magnet_tag is None:
-            continue
-        magnet = magnet_tag['href']
-
-        # parse descriptions separately
-        description = row.find('font', class_='detDesc').text
-        size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B',
-                          description)[0].split()
-        uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)',
-                              description)[0]
-
-        results.append({
-            'magnet': magnet,
-            'seeds': seeds,
-            'leechers': leechers,
-            'size': size,
-            'uploaded': uploaded,
-            'id': id_
-        })
-
-    return results
-
-
-def remote(printer, pages, category, sort, mode, terms, mirror):
-    res_l = []
-
-    if pages < 1:
-        raise ValueError('Please provide an integer greater than 0 '
-                         'for the number of pages to fetch.')
+def pretty_size(size):
+    ranges = [('PiB', 1125899906842624),
+              ('TiB', 1099511627776),
+              ('GiB', 1073741824),
+              ('MiB', 1048576),
+              ('KiB', 1024)]
+    for unit, value in ranges:
+        if size >= value:
+            return '{:.1f} {}'.format(size/value, unit)
+    return str(size) + ' B'
+
+
+def pretty_date(ts):
+    date = datetime.fromtimestamp(int(ts))
+    return date.strftime('%Y-%m-%d %H:%M')
+
+
+def make_magnet(name, info_hash):
+    return 'magnet:?xt=urn:btih:{}&dn={}'.format(
+        info_hash, parse.quote(name, ''))
+
+
+def remote(printer, category, sort, mode, terms, mirror):
+    results = []
 
     # Catch the Ctrl-C exception and exit cleanly
     try:
-        jar = CookieJar()
-        opener = request.build_opener(
-            request.HTTPErrorProcessor,
-            request.HTTPCookieProcessor(jar))
-
-        for page in range(pages):
-            path = build_request_path(page, category, sort, mode, terms)
-
-            req = request.Request(mirror + path,
-                                  headers=pirate.data.default_headers)
-            req.add_header('Accept-encoding', 'gzip')
-
-            try:
-                f = opener.open(req, timeout=pirate.data.default_timeout)
-            except urllib.error.URLError as e:
-                res = e.fp.read().decode()
-                if e.code == 503 and 'cf-browser-verification' in res:
-                    raise IOError('Cloudflare protected')
-                raise e
-
-            if f.info().get('Content-Encoding') == 'gzip':
-                f = gzip.GzipFile(fileobj=BytesIO(f.read()))
-
-            res = f.read().decode('utf-8')
-            res_l += parse_page(res)
+        req = request.Request(
+            '{}/q.php?q={}&cat={}'.format(
+                mirror, ' '.join(terms), category),
+            headers=pirate.data.default_headers)
+        try:
+            f = request.urlopen(req, timeout=pirate.data.default_timeout)
+        except urllib.error.URLError as e:
+            raise e
+
+        if f.info().get('Content-Encoding') == 'gzip':
+            f = gzip.GzipFile(fileobj=BytesIO(f.read()))
+
+        for res in json.load(f):
+            res['size'] = pretty_size(int(res['size']))
+            res['magnet'] = make_magnet(res['name'], res['info_hash'])
+            res['info_hash'] = int(res['info_hash'], 16)
+            res['uploaded'] = pretty_date(res['added'])
+            results.append(res)
+
+        return results
 
     except KeyboardInterrupt:
         printer.print('\nCancelled.')
         sys.exit(0)
 
-    return res_l
+
+def find_api(mirror):
+    # try common paths
+    for path in ['', '/apip', '/api.php?url=']:
+        req = request.Request(mirror + path + '/q.php?q=test&cat=0',
+                              headers=pirate.data.default_headers)
+        try:
+            f = request.urlopen(req, timeout=pirate.data.default_timeout)
+            if f.info().get_content_type() == 'application/json':
+                return mirror + path
+        except urllib.error.URLError:
+            pass
+
+    # extract the api path from main.js
+    req = request.Request(mirror + '/static/main.js',
+                          headers=pirate.data.default_headers)
+    try:
+        f = request.urlopen(req, timeout=pirate.data.default_timeout)
+        if f.info().get_content_type() == 'application/javascript':
+            match = re.search("var server='([^']+)'", f.read().decode())
+            return mirror + match.group(1)
+    except urllib.error.URLError:
+        raise IOError('API not found: no main.js')
+
+    raise IOError('API not found')
@@ -200,44 +135,37 @@ def get_torrent(info_hash):
 
 def save_torrents(printer, chosen_links, results, folder):
     for link in chosen_links:
-        magnet = results[link]['magnet']
-        name = re.search(r'dn=([^\&]*)', magnet)
-        torrent_name = parse.unquote(name.group(1)).replace('+', ' ')
-        info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16)
-        torrent_name = torrent_name.replace('/', '_').replace('\\', '_')
+        result = results[link]
+        torrent_name = result['name'].replace('/', '_').replace('\\', '_')
         file = os.path.join(folder, torrent_name + '.torrent')
 
         try:
-            torrent = get_torrent(info_hash)
+            torrent = get_torrent(result['info_hash'])
         except urllib.error.HTTPError as e:
             printer.print('There is no cached file for this torrent :('
                           ' \nCode: {} - {}'.format(e.code, e.reason),
                           color='ERROR')
         else:
             open(file, 'wb').write(torrent)
-            printer.print('Saved {:X} in {}'.format(info_hash, file))
+            printer.print('Saved {:X} in {}'.format(result['info_hash'], file))
 
 
 def save_magnets(printer, chosen_links, results, folder):
     for link in chosen_links:
-        magnet = results[link]['magnet']
-        name = re.search(r'dn=([^\&]*)', magnet)
-        torrent_name = parse.unquote(name.group(1)).replace('+', ' ')
-        info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16)
-        torrent_name = torrent_name.replace('/', '_').replace('\\', '_')
+        result = results[link]
+        torrent_name = result['name'].replace('/', '_').replace('\\', '_')
         file = os.path.join(folder, torrent_name + '.magnet')
 
-        printer.print('Saved {:X} in {}'.format(info_hash, file))
+        printer.print('Saved {:X} in {}'.format(result['info_hash'], file))
         with open(file, 'w') as f:
-            f.write(magnet + '\n')
+            f.write(result['magnet'] + '\n')
 
 
 def copy_magnets(printer, chosen_links, results):
     clipboard_text = ''
     for link in chosen_links:
-        magnet = results[link]['magnet']
-        info_hash = int(re.search(r'btih:([a-fA-F0-9]{40})', magnet).group(1), 16)
-        clipboard_text += magnet + "\n"
-        printer.print('Copying {:X} to clipboard'.format(info_hash))
+        result = results[link]
+        clipboard_text += result['magnet'] + "\n"
+        printer.print('Copying {:X} to clipboard'.format(result['info_hash']))
 
     pyperclip.copy(clipboard_text)
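
Note: with the HTML scraping gone, the new pretty_size, pretty_date and make_magnet helpers do all result formatting. A quick usage sketch (outputs shown as comments; pretty_date prints in the local timezone, and the info hash below is a dummy value):

    from pirate.torrent import pretty_size, pretty_date, make_magnet

    print(pretty_size(123))           # 123 B
    print(pretty_size(3 * 1048576))   # 3.0 MiB
    print(pretty_date(1590019066))    # e.g. 2020-05-21 01:57 (local time)
    print(make_magnet('Some Torrent', 'deadbeef' * 5))
    # magnet:?xt=urn:btih:deadbeefdeadbeefdeadbeefdeadbeefdeadbeef&dn=Some%20Torrent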