mirror of
https://github.com/vikstrous/pirate-get
synced 2025-01-26 12:34:19 +01:00
Merge pull request #40 from brianpeiris/feature/openbay
Update to scrape openbay instead
This commit is contained in:
commit
a62a963614
@ -7,7 +7,7 @@ Tested on Arch Linux mostly. It should work on any other Linux too. Let me know
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
Make sure you have python 2 installed.
|
Make sure you have Python 2 and pip installed.
|
||||||
|
|
||||||
Run install.sh
|
Run install.sh
|
||||||
|
|
||||||
|
@ -14,10 +14,13 @@ TMP=$(mktemp pirate-get-XXXXXX)
|
|||||||
echo "#!/usr/bin/env python" > "$TMP"
|
echo "#!/usr/bin/env python" > "$TMP"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
sed 1d $(dirname $0)/pirate-get.py >> "$TMP"
|
sed 1d $(dirname $0)/pirate-get.py >> "$TMP"
|
||||||
|
|
||||||
cp "$TMP" /usr/bin/pirate-get &&
|
cp "$TMP" /usr/bin/pirate-get &&
|
||||||
chmod +x /usr/bin/pirate-get &&
|
chmod +x /usr/bin/pirate-get &&
|
||||||
chmod 755 /usr/bin/pirate-get &&
|
chmod 755 /usr/bin/pirate-get &&
|
||||||
|
|
||||||
|
pip install -r requirements.txt &&
|
||||||
|
|
||||||
rm $TMP
|
rm $TMP
|
||||||
} || rm $TMP
|
} || rm $TMP
|
||||||
|
141
pirate-get.py
141
pirate-get.py
@ -33,6 +33,8 @@ from pprint import pprint
|
|||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import gzip
|
import gzip
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
class NoRedirection(urllib2.HTTPErrorProcessor):
|
class NoRedirection(urllib2.HTTPErrorProcessor):
|
||||||
|
|
||||||
def http_response(self, request, response):
|
def http_response(self, request, response):
|
||||||
@ -92,8 +94,8 @@ def main():
|
|||||||
parser.add_argument('-b', dest='browse', action='store_true', help="Display in Browse mode", default=False)
|
parser.add_argument('-b', dest='browse', action='store_true', help="Display in Browse mode", default=False)
|
||||||
parser.add_argument('search', metavar='search', nargs="*", help="Term to search for")
|
parser.add_argument('search', metavar='search', nargs="*", help="Term to search for")
|
||||||
parser.add_argument('-c', dest='category', metavar='category', help="Specify a category to search", default="All")
|
parser.add_argument('-c', dest='category', metavar='category', help="Specify a category to search", default="All")
|
||||||
parser.add_argument('-s', dest='sort', metavar='sort', help="Specify a sort option", default="SeedersDsc")
|
parser.add_argument('-s', dest='sort', metavar='sort', help="Specify a sort option", default="seeders.desc")
|
||||||
parser.add_argument('-R', dest='recent', action='store_true', help="Torrents uploaded in the last 48hours. *ignored in searches*", default=False)
|
parser.add_argument('-R', dest='recent', action='store_true', help="Torrents uploaded in the last two weeks. *ignored in searches*", default=False)
|
||||||
parser.add_argument('-l', dest='list_categories', action='store_true', help="List categories", default=False)
|
parser.add_argument('-l', dest='list_categories', action='store_true', help="List categories", default=False)
|
||||||
parser.add_argument('--list_sorts', dest='list_sorts', action='store_true', help="List Sortable Types", default=False)
|
parser.add_argument('--list_sorts', dest='list_sorts', action='store_true', help="List Sortable Types", default=False)
|
||||||
parser.add_argument('-t',dest='transmission',action='store_true', help="call transmission-remote to start the download", default=False)
|
parser.add_argument('-t',dest='transmission',action='store_true', help="call transmission-remote to start the download", default=False)
|
||||||
@ -104,9 +106,30 @@ def main():
|
|||||||
parser.add_argument('-a', dest='download_all', action='store_true', help="download all results", default=False)
|
parser.add_argument('-a', dest='download_all', action='store_true', help="download all results", default=False)
|
||||||
parser.add_argument('--color', dest='color', action='store_true', help="use colored output", default=False)
|
parser.add_argument('--color', dest='color', action='store_true', help="use colored output", default=False)
|
||||||
|
|
||||||
categories = {"All":"0","Audio":"100","Audio/Music":"101","Audio/Audio books":"102","Audio/Sound clips":"103","Audio/FLAC":"104","Audio/Other":"199","Video":"200","Video/Movies":"201","Video/Movies DVDR":"202","Video/Music videos":"203","Video/Movie clips":"204","Video/TV shows":"205","Video/Handheld":"206","Video/HD - Movies":"207","Video/HD - TV shows":"208","Video/3D":"209","Video/Other":"299","Applications":"300","Applications/Windows":"301","Applications/Mac":"302","Applications/UNIX":"303","Applications/Handheld":"304","Applications/IOS (iPad/iPhone)":"305","Applications/Android":"306","Applications/Other OS":"399","Games":"400","Games/PC":"401","Games/Mac":"402","Games/PSx":"403","Games/XBOX360":"404","Games/Wii":"405","Games/Handheld":"406","Games/IOS (iPad/iPhone)":"407","Games/Android":"408","Games/Other":"499","Porn":"500","Porn/Movies":"501","Porn/Movies DVDR":"502","Porn/Pictures":"503","Porn/Games":"504","Porn/HD - Movies":"505","Porn/Movie clips":"506","Porn/Other":"599","Other":"600","Other/E-books":"601","Other/Comics":"602","Other/Pictures":"603","Other/Covers":"604","Other/Physibles":"605","Other/Other":"699"}
|
categories = {
|
||||||
|
"All":"0",
|
||||||
|
"Anime":"1",
|
||||||
|
"Software":"2",
|
||||||
|
"Games":"3",
|
||||||
|
"Adult":"4",
|
||||||
|
"Movies":"5",
|
||||||
|
"Music":"6",
|
||||||
|
"Other":"7",
|
||||||
|
"Series & TV":"8",
|
||||||
|
"Books":"9",
|
||||||
|
}
|
||||||
|
|
||||||
sorts = {"TitleDsc":"1","TitleAsc":"2","DateDsc":"3","DateAsc":"4","SizeDsc":"5","SizeAsc":"6","SeedersDsc":"7","SeedersAsc":"8","LeechersDsc":"9","LeechersAsc":"10","CategoryDsc":"13","CategoryAsc":"14","Default":"99"}
|
sorts = {
|
||||||
|
"created_at":"0",
|
||||||
|
"created_at.desc":"1",
|
||||||
|
"size":"2",
|
||||||
|
"size.desc":"3",
|
||||||
|
"seeders":"4",
|
||||||
|
"seeders.desc":"5",
|
||||||
|
"leechers":"6",
|
||||||
|
"leechers.desc":"7",
|
||||||
|
}
|
||||||
|
reverse_sorts = {v: k for k, v in sorts.items()}
|
||||||
|
|
||||||
#todo: redo this with html parser instead of regex
|
#todo: redo this with html parser instead of regex
|
||||||
def remote(args, mirror):
|
def remote(args, mirror):
|
||||||
@ -126,35 +149,37 @@ def main():
|
|||||||
category = "0";
|
category = "0";
|
||||||
print ("Invalid category ignored", color="WARN")
|
print ("Invalid category ignored", color="WARN")
|
||||||
|
|
||||||
if str(args.sort) in sorts.values():
|
if args.sort in sorts.keys():
|
||||||
sort = args.sort;
|
sort = args.sort;
|
||||||
elif args.sort in sorts.keys():
|
elif args.sort in sorts.values():
|
||||||
sort = sorts[args.sort]
|
sort = reverse_sorts[args.sort]
|
||||||
else:
|
else:
|
||||||
sort = "99";
|
|
||||||
print ("Invalid sort ignored", color="WARN")
|
print ("Invalid sort ignored", color="WARN")
|
||||||
|
|
||||||
|
query_parameters = {
|
||||||
|
"iht":"0",
|
||||||
|
"age":"0",
|
||||||
|
"Torrent_sort":"",
|
||||||
|
"LTorrent_page":0,
|
||||||
|
"q":"",
|
||||||
|
}
|
||||||
# Catch the Ctrl-C exception and exit cleanly
|
# Catch the Ctrl-C exception and exit cleanly
|
||||||
try:
|
try:
|
||||||
sizes = []
|
sizes = []
|
||||||
uploaded = []
|
uploaded = []
|
||||||
identifiers = []
|
identifiers = []
|
||||||
|
OPENBAY_PAGE_LEN = 40
|
||||||
for page in xrange(pages):
|
for page in xrange(pages):
|
||||||
|
|
||||||
#
|
query_parameters["LTorrent_page"] = page * OPENBAY_PAGE_LEN
|
||||||
if args.browse:
|
query_parameters["Torrent_sort"] = sort
|
||||||
path = "/browse/"
|
query_parameters["iht"] = category
|
||||||
if(category == "0"):
|
if len(args.search) == 0:
|
||||||
category = '100'
|
query_parameters["age"] = "14" if args.recent else "0"
|
||||||
path = '/browse/' + category + '/' + str(page) + '/' + str(sort)
|
|
||||||
elif len(args.search) == 0:
|
|
||||||
path = "/top/48h" if args.recent else "/top/"
|
|
||||||
if(category == "0"):
|
|
||||||
path += 'all'
|
|
||||||
else:
|
|
||||||
path += category
|
|
||||||
else:
|
else:
|
||||||
path = '/search/' + "+".join(args.search) + '/' + str(page) + '/' + str(sort) + '/' + category
|
query_parameters["q"] = "+".join(args.search)
|
||||||
|
|
||||||
|
path = "/search.php?" + '&'.join(k + "=" + str(v) for k, v in query_parameters.items())
|
||||||
|
|
||||||
request = urllib2.Request(mirror + path)
|
request = urllib2.Request(mirror + path)
|
||||||
request.add_header('Accept-encoding', 'gzip')
|
request.add_header('Accept-encoding', 'gzip')
|
||||||
@ -163,39 +188,36 @@ def main():
|
|||||||
buf = StringIO(f.read())
|
buf = StringIO(f.read())
|
||||||
f = gzip.GzipFile(fileobj=buf)
|
f = gzip.GzipFile(fileobj=buf)
|
||||||
res = f.read()
|
res = f.read()
|
||||||
found = re.findall(""""(magnet\:\?xt=[^"]*)|<td align="right">([^<]+)</td>""", res)
|
|
||||||
|
|
||||||
# check for a blocked mirror
|
soup = BeautifulSoup(res)
|
||||||
no_results = re.search(""""No hits\.""", res)
|
found = soup.select('table.table-torrents>tbody>tr')
|
||||||
if found == [] and not no_results is None:
|
|
||||||
# Contradiction - we found no results, but the page didn't say there were no results
|
|
||||||
# the page is probably not actually the pirate bay, so let's try another mirror
|
|
||||||
raise Exception("Blocked mirror detected.")
|
|
||||||
|
|
||||||
# get sizes as well and substitute the character
|
results_body = soup.table.tbody
|
||||||
sizes.extend([match.replace(" ", " ") for match in re.findall("(?<=Size )[0-9.]+\ \;[KMGT]*[i ]*B",res)])
|
|
||||||
uploaded.extend([match.replace(" ", " ") for match in re.findall("(?<=Uploaded ).+(?=\, Size)",res)])
|
|
||||||
identifiers.extend([match.replace(" ", " ") for match in re.findall("(?<=/torrent/)[0-9]+(?=/)",res)])
|
|
||||||
|
|
||||||
state = "seeds"
|
get_text = lambda elements: [element.get_text() for element in elements]
|
||||||
curr = ['',0,0] #magnet, seeds, leeches
|
get_text_by_class = lambda class_: get_text(results_body.find_all(class_=class_))
|
||||||
for f in found:
|
get_links = lambda links: [link.get('href') for link in links]
|
||||||
if f[1] == '':
|
|
||||||
curr[0] = f[0]
|
sizes.extend(get_text_by_class('size-row'))
|
||||||
else:
|
uploaded.extend(get_text_by_class('date-row'))
|
||||||
if state == 'seeds':
|
identifiers.extend([
|
||||||
curr[1] = f[1]
|
re.search('torrent/(\d+)', link).group(1)
|
||||||
state = 'leeches'
|
for link in
|
||||||
else:
|
get_links(results_body.find_all('a', href=re.compile('/torrent/')))
|
||||||
curr[2] = f[1]
|
])
|
||||||
state = 'seeds'
|
|
||||||
res_l.append(curr)
|
links = get_links(results_body.find_all('a', title='MAGNET LINK'))
|
||||||
curr = ['', 0, 0]
|
seeders = get_text_by_class('seeders-row')
|
||||||
|
leechers = get_text_by_class('leechers-row')
|
||||||
|
for i in xrange(len(links)):
|
||||||
|
res_l.append([links[i], seeders[i], leechers[i]])
|
||||||
|
|
||||||
|
if len(links) < OPENBAY_PAGE_LEN:
|
||||||
|
break
|
||||||
except KeyboardInterrupt :
|
except KeyboardInterrupt :
|
||||||
print("\nCancelled.")
|
print("\nCancelled.")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# return the sizes in a separate list
|
|
||||||
return res_l, sizes, uploaded, identifiers
|
return res_l, sizes, uploaded, identifiers
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -261,7 +283,7 @@ def main():
|
|||||||
if args.database:
|
if args.database:
|
||||||
mags = local(args)
|
mags = local(args)
|
||||||
else:
|
else:
|
||||||
mirrors = ["http://thepiratebay.se"]
|
mirrors = ["https://oldpiratebay.org", "http://thepiratebay.se"]
|
||||||
try:
|
try:
|
||||||
opener = urllib2.build_opener(NoRedirection)
|
opener = urllib2.build_opener(NoRedirection)
|
||||||
f = opener.open("https://proxybay.info/list.txt")
|
f = opener.open("https://proxybay.info/list.txt")
|
||||||
@ -275,14 +297,14 @@ def main():
|
|||||||
try:
|
try:
|
||||||
print("Trying " + mirror)
|
print("Trying " + mirror)
|
||||||
mags, sizes, uploaded, identifiers = remote(args, mirror)
|
mags, sizes, uploaded, identifiers = remote(args, mirror)
|
||||||
break
|
if not mags or len(mags) == 0:
|
||||||
|
print("No results from " + mirror)
|
||||||
|
else:
|
||||||
|
break
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
print(format(e))
|
print(format(e))
|
||||||
print("Could not contact " + mirror, color="WARN")
|
print("Could not contact " + mirror, color="WARN")
|
||||||
|
|
||||||
if not mags or len(mags) == 0:
|
|
||||||
print("no results")
|
|
||||||
return
|
|
||||||
# enhanced print output with column titles
|
# enhanced print output with column titles
|
||||||
def print_search_results():
|
def print_search_results():
|
||||||
print("%5s %6s %6s %-5s %-11s %-11s %s" \
|
print("%5s %6s %6s %-5s %-11s %-11s %s" \
|
||||||
@ -312,7 +334,7 @@ def main():
|
|||||||
uploaded[m], torrent_name), color=cur_color)
|
uploaded[m], torrent_name), color=cur_color)
|
||||||
def print_descriptions(chosen_links):
|
def print_descriptions(chosen_links):
|
||||||
for link in chosen_links:
|
for link in chosen_links:
|
||||||
path = '/torrent/' + identifiers[int(link)] + '/'
|
path = '/torrent/' + identifiers[int(link)] + '/pirate-get'
|
||||||
request = urllib2.Request(mirror + path)
|
request = urllib2.Request(mirror + path)
|
||||||
request.add_header('Accept-encoding', 'gzip')
|
request.add_header('Accept-encoding', 'gzip')
|
||||||
f = urllib2.urlopen(request)
|
f = urllib2.urlopen(request)
|
||||||
@ -320,14 +342,19 @@ def main():
|
|||||||
buf = StringIO(f.read())
|
buf = StringIO(f.read())
|
||||||
f = gzip.GzipFile(fileobj=buf)
|
f = gzip.GzipFile(fileobj=buf)
|
||||||
res = f.read()
|
res = f.read()
|
||||||
|
|
||||||
name = re.search("dn=([^\&]*)", mags[int(link)][0])
|
name = re.search("dn=([^\&]*)", mags[int(link)][0])
|
||||||
torrent_name = urllib.unquote(name.group(1).encode('ascii')) \
|
torrent_name = urllib.unquote(name.group(1).encode('ascii')) \
|
||||||
.decode('utf-8').replace("+", " ")
|
.decode('utf-8').replace("+", " ")
|
||||||
desc = re.search(r"<div class=\"nfo\">\s*<pre>(.+?)(?=</pre>)", res, re.DOTALL).group(1)
|
|
||||||
|
desc = re.search(r"<div class=\"nfo\">\s*<pre>(.*?)(?=</pre>)", res, re.DOTALL).group(1)
|
||||||
# Replace HTML links with markdown style versions
|
# Replace HTML links with markdown style versions
|
||||||
desc = re.sub(r"<a href=\"\s*([^\"]+?)\s*\"[^>]*>(\s*)([^<]+?)(\s*)</a>", r"\2[\3](\1)\4", desc)
|
desc = re.sub(r"<a href=\"\s*([^\"]+?)\s*\"[^>]*>(\s*)([^<]+?)(\s*)</a>", r"\2[\3](\1)\4", desc).strip()
|
||||||
print ('Description for "' + torrent_name + '":', color="zebra_1")
|
if desc == '':
|
||||||
print (desc, color="zebra_0")
|
print ('No description given for "' + torrent_name + '"', color="zebra_1")
|
||||||
|
else:
|
||||||
|
print ('Description for "' + torrent_name + '":', color="zebra_1")
|
||||||
|
print (desc, color="zebra_0")
|
||||||
|
|
||||||
def print_fileLists(chosen_links):
|
def print_fileLists(chosen_links):
|
||||||
for link in chosen_links:
|
for link in chosen_links:
|
||||||
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
beautifulsoup4==4.3.2
|
||||||
|
colorama==0.3.2
|
Loading…
Reference in New Issue
Block a user