1
0
mirror of https://github.com/vikstrous/pirate-get synced 2025-01-10 10:04:21 +01:00
pirate-get/pirate/torrent.py

215 lines
7.0 KiB
Python
Raw Normal View History

2015-08-30 03:28:43 +02:00
import re
import sys
import gzip
import pyperclip
2015-08-30 03:28:43 +02:00
import urllib.request as request
import urllib.parse as parse
import urllib.error
import os.path
2015-08-30 03:28:43 +02:00
2016-07-03 19:26:58 +02:00
from bs4 import BeautifulSoup
2015-09-04 06:44:02 +02:00
2015-08-30 03:28:43 +02:00
import pirate.data
from io import BytesIO
parser_regex = r'"(magnet\:\?xt=[^"]*)|<td align="right">([^<]+)</td>'
def parse_category(printer, category):
    """Normalise a category given by name or numeric id.

    Accepts either a numeric id (int or numeric string) or a category
    name found in pirate.data.categories.  Unknown values print a
    warning and fall back to 0.
    """
    categories = pirate.data.categories
    try:
        category = int(category)
    except ValueError:
        pass  # non-numeric input: treat it as a category name
    if category in categories.values():
        # already a valid numeric id
        return category
    if category in categories:
        # known name -> translate to its numeric id
        return categories[category]
    printer.print('Invalid category ignored', color='WARN')
    return 0
2015-08-30 03:28:43 +02:00
def parse_sort(printer, sort):
    """Normalise a sort option given by name or numeric id.

    Accepts either a numeric id (int or numeric string) or a sort
    name found in pirate.data.sorts.  Unknown values print a warning
    and fall back to 99.
    """
    sorts = pirate.data.sorts
    try:
        sort = int(sort)
    except ValueError:
        pass  # non-numeric input: treat it as a sort name
    if sort in sorts.values():
        # already a valid numeric id
        return sort
    if sort in sorts:
        # known name -> translate to its numeric id
        return sorts[sort]
    printer.print('Invalid sort ignored', color='WARN')
    return 99
2016-07-07 03:51:13 +02:00
# TODO:
# * warn users when using a sort in a mode that doesn't accept sorts
# * warn users when using search terms in a mode
# that doesn't accept search terms
# * same with page parameter for top and top48h
# * warn the user if trying to use a minor category with top48h
2015-09-04 05:25:24 +02:00
def build_request_path(page, category, sort, mode, terms):
    """Build the URL path for a query against a pirate bay mirror.

    page     -- zero-based page number (search/browse modes)
    category -- numeric category id; 0 means "all"
    sort     -- numeric sort id
    mode     -- one of 'browse', 'recent', 'top', 'search'
    terms    -- list of search terms (search mode only)

    Raises Exception for an unknown mode.
    """
    if mode == 'browse':
        if category == 0:
            # the browse endpoint uses 100 for "all categories"
            category = 100
        return '/browse/{}/{}/{}'.format(category, page, sort)
    elif mode == 'recent':
        # This is not a typo. There is no / between 48h and the category.
        path = '/top/48h'
        # only major categories can be used with this mode
        if category == 0:
            return path + 'all'
        return path + str(category)
    elif mode == 'top':
        path = '/top/'
        if category == 0:
            return path + 'all'
        return path + str(category)
    elif mode == 'search':
        # use the `parse` alias imported at the top of the file instead of
        # relying on `urllib.parse` being reachable via `import urllib.error`
        query = parse.quote_plus(' '.join(terms))
        return '/search/{}/{}/{}/{}'.format(query, page, sort, category)
    else:
        raise Exception('Unknown mode.')
2015-09-04 07:18:38 +02:00
def parse_page(html):
    """Extract torrent entries from a result page.

    Returns a list of dicts with keys: magnet, seeds, leechers,
    size (value/unit pair from str.split), uploaded, id.
    Raises IOError when the page looks like a blocked mirror.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='searchResult')
    no_results = re.search(r'No hits\. Try adding an asterisk in '
                           r'you search phrase\.', html)

    # A page with neither a result table nor the "no hits" message is
    # probably not the pirate bay at all, so try another mirror.
    if not table and not no_results:
        raise IOError('Blocked mirror detected.')

    torrents = []
    if no_results:
        return torrents

    # walk the rows, skipping the heading row
    for row in table('tr')[1:]:
        detail_link = row.find('a', class_='detLink')
        if detail_link is None:
            continue
        torrent_id = detail_link['href'].split('/')[2]

        # seeds/leechers are the last two cells of the row
        cells = row('td')
        seeds = cells[-2].text
        leechers = cells[-1].text

        magnet = row.find(
            lambda tag: tag.name == 'a' and
            tag['href'].startswith('magnet'))['href']

        # size and upload date have to be pulled out of the
        # description blurb
        description = row.find('font', class_='detDesc').text
        size = re.findall(r'(?<=Size )[0-9.]+\s[KMGT]*[i ]*B',
                          description)[0].split()
        uploaded = re.findall(r'(?<=Uploaded ).+(?=\, Size)',
                              description)[0]

        torrents.append({
            'magnet': magnet,
            'seeds': seeds,
            'leechers': leechers,
            'size': size,
            'uploaded': uploaded,
            'id': torrent_id,
        })

    return torrents
2015-09-04 05:25:24 +02:00
def remote(printer, pages, category, sort, mode, terms, mirror):
    """Fetch and parse up to `pages` result pages from a mirror.

    Returns the concatenated list of result dicts from parse_page.
    Raises ValueError for a non-positive page count.  A Ctrl-C during
    the fetch prints a message and exits the program cleanly.
    """
    if pages < 1:
        raise ValueError('Please provide an integer greater than 0 '
                         'for the number of pages to fetch.')

    results = []
    # Catch the Ctrl-C exception and exit cleanly
    try:
        for page in range(pages):
            path = build_request_path(page, category, sort, mode, terms)
            req = request.Request(mirror + path,
                                  headers=pirate.data.default_headers)
            req.add_header('Accept-encoding', 'gzip')
            # context manager closes the response even if parsing fails
            with request.urlopen(req,
                                 timeout=pirate.data.default_timeout) as f:
                body = f.read()
                if f.info().get('Content-Encoding') == 'gzip':
                    body = gzip.decompress(body)
            results += parse_page(body.decode('utf-8'))
    except KeyboardInterrupt:
        printer.print('\nCancelled.')
        sys.exit(0)

    return results
2015-08-30 03:28:43 +02:00
def get_torrent(info_hash):
    """Download the cached .torrent file for `info_hash` (an int) from
    itorrents.org and return its raw bytes.

    Raises urllib.error.HTTPError when the cache has no entry.
    """
    url = 'http://itorrents.org/torrent/{:X}.torrent'
    req = request.Request(url.format(info_hash),
                          headers=pirate.data.default_headers)
    req.add_header('Accept-encoding', 'gzip')

    # context manager closes the connection deterministically
    # (the original leaked the response object)
    with request.urlopen(req, timeout=pirate.data.default_timeout) as resp:
        data = resp.read()
        if resp.info().get('Content-Encoding') == 'gzip':
            data = gzip.decompress(data)
    return data
def save_torrents(printer, chosen_links, results, folder):
    """Download the .torrent file for each chosen result into `folder`.

    Results whose torrent is not in the itorrents cache are reported
    and skipped instead of aborting the whole batch.
    """
    for link in chosen_links:
        magnet = results[link]['magnet']
        name = re.search(r'dn=([^\&]*)', magnet)
        torrent_name = parse.unquote(name.group(1)).replace('+', ' ')
        info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16)
        # strip path separators so the name is a safe single filename
        torrent_name = torrent_name.replace('/', '_').replace('\\', '_')
        file = os.path.join(folder, torrent_name + '.torrent')

        try:
            torrent = get_torrent(info_hash)
        except urllib.error.HTTPError as e:
            printer.print('There is no cached file for this torrent :('
                          ' \nCode: {} - {}'.format(e.code, e.reason),
                          color='ERROR')
        else:
            # context manager closes the handle promptly
            # (the original `open(...).write(...)` leaked it)
            with open(file, 'wb') as f:
                f.write(torrent)
            printer.print('Saved {:X} in {}'.format(info_hash, file))
2015-08-30 03:28:43 +02:00
def save_magnets(printer, chosen_links, results, folder):
    """Write each chosen result's magnet link to a .magnet file in `folder`."""
    for index in chosen_links:
        magnet = results[index]['magnet']

        # recover a human-readable name from the dn= parameter
        raw_name = re.search(r'dn=([^\&]*)', magnet).group(1)
        torrent_name = parse.unquote(raw_name).replace('+', ' ')
        # path separators would escape the target folder
        torrent_name = torrent_name.replace('/', '_').replace('\\', '_')

        info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16)

        file = os.path.join(folder, torrent_name + '.magnet')
        printer.print('Saved {:X} in {}'.format(info_hash, file))
        with open(file, 'w') as f:
            f.write(magnet + '\n')
def copy_magnets(printer, chosen_links, results):
    """Copy the magnet links of the chosen results to the clipboard,
    one per line."""
    lines = []
    for index in chosen_links:
        magnet = results[index]['magnet']
        info_hash = int(re.search(r'btih:([a-f0-9]{40})', magnet).group(1), 16)
        lines.append(magnet + "\n")
        printer.print('Copying {:X} to clipboard'.format(info_hash))
    pyperclip.copy(''.join(lines))