v0.2.0 released

## Changes
### magneticod
* Cython-based bencoder.pyx is used instead of our own pure Python solution.
* `--node-addr` commandline argument added.
* TCP QUICKACK is now enabled only when the platform supports it (instead of unconditionally) so that magneticod can work on macOS as well.
* Database index added for `info_hash` column of `torrents` table.

### magneticow
* Some small performance improvements for search.
This commit is contained in:
Bora M. ALPER 2017-04-10 17:44:09 +04:00
parent dd37830b80
commit 1c15afb45a
10 changed files with 113 additions and 131 deletions

View File

@ -12,4 +12,4 @@
#
# You should have received a copy of the GNU Affero General Public License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
__version__ = (0, 1, 0)
__version__ = (0, 2, 0)

View File

@ -12,10 +12,14 @@
#
# You should have received a copy of the GNU Affero General Public License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
import argparse
import collections
import functools
import logging
import ipaddress
import selectors
import textwrap
import urllib.parse
import itertools
import os
import sys
@ -51,9 +55,13 @@ complete_info_hashes = set()
def main():
global complete_info_hashes, database, node, peers, selector
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)8s %(message)s")
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s")
logging.info("magneticod v%d.%d.%d started", *__version__)
arguments = parse_cmdline_arguments()
if arguments is None:
return 2
# noinspection PyBroadException
try:
path = os.path.join(appdirs.user_data_dir("magneticod"), "database.sqlite3")
@ -64,7 +72,7 @@ def main():
complete_info_hashes = database.get_complete_info_hashes()
node = dht.SybilNode()
node = dht.SybilNode(arguments.node_addr)
node.when_peer_found = on_peer_found
selector.register(node, selectors.EVENT_READ)
@ -155,5 +163,63 @@ def loop() -> None:
selector.modify(fileobj, selectors.EVENT_READ | selectors.EVENT_WRITE)
def parse_cmdline_arguments() -> typing.Optional[argparse.Namespace]:
    """Parse the command line (sys.argv).

    Returns the parsed namespace with ``node_addr`` normalised to an
    ``(ip, port)`` tuple — ("0.0.0.0", 0) when the flag is absent — or
    None when the supplied address is invalid (a critical message is
    logged in that case).
    """
    arg_parser = argparse.ArgumentParser(
        description="Autonomous BitTorrent DHT crawler and metadata fetcher.",
        epilog=textwrap.dedent("""\
            Copyright (C) 2017 Mert Bora ALPER <bora@boramalper.org>
            Dedicated to Cemile Binay, in whose hands I thrived.
            This program is free software: you can redistribute it and/or modify it under
            the terms of the GNU Affero General Public License as published by the Free
            Software Foundation, either version 3 of the License, or (at your option) any
            later version.
            This program is distributed in the hope that it will be useful, but WITHOUT ANY
            WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
            PARTICULAR PURPOSE. See the GNU Affero General Public License for more
            details.
            You should have received a copy of the GNU Affero General Public License along
            with this program. If not, see <http://www.gnu.org/licenses/>.
        """),
        allow_abbrev=False,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    arg_parser.add_argument(
        "--node-addr", action="store", type=str, required=False,
        help="the address of the (DHT) node magneticod will use"
    )

    parsed = arg_parser.parse_args(sys.argv[1:])

    if parsed.node_addr:
        parsed.node_addr = parse_ip_port(parsed.node_addr)
        if parsed.node_addr is None:
            logging.critical("Invalid node address supplied!")
            return None
    else:
        # Default: bind every interface, ephemeral port.
        parsed.node_addr = ("0.0.0.0", 0)

    return parsed
def parse_ip_port(netloc: str) -> typing.Optional[typing.Tuple[str, int]]:
    """Parse "IP" or "IP:PORT" into an ``(ip, port)`` tuple.

    A bare IP address (IPv4, or bracket-less IPv6 such as "::1") yields
    port 0.  Host names are deliberately rejected — only literal IP
    addresses are accepted.

    Returns:
        (ip, port) on success, or None when *netloc* is not a literal IP
        address or the port part is missing/invalid (e.g. "1.2.3.4:").
    """
    # Case 1: no port supplied — the whole string must be an IP address.
    try:
        return str(ipaddress.ip_address(netloc)), 0
    except ValueError:
        pass

    # Case 2: "IP:PORT" (IPv6 must be bracketed, e.g. "[::1]:6881").
    # Lean on urllib's netloc handling for the splitting and the port
    # range validation (.port raises ValueError for out-of-range ports).
    try:
        parsed = urllib.parse.urlparse("//{}".format(netloc))
        ip = str(ipaddress.ip_address(parsed.hostname))
        port = parsed.port
    except ValueError:
        return None

    if port is None:
        # A trailing colon with no digits after it ("1.2.3.4:").
        return None

    return ip, port
if __name__ == "__main__":
sys.exit(main())

View File

@ -16,21 +16,23 @@
"""
bencode
Wrapper around bencoder.pyx library.
bencoder.pyx
Copyright (c) 2016, whtsky
All rights reserved.
https://github.com/whtsky/bencoder.pyx
Warning:
Encoders do NOT check for circular objects! (and will NEVER check due to speed concerns).
TODO:
Add support for integers in scientific notation. (?)
Please do re-write this as a shared C module so that we can gain a H U G E speed & performance gain!
I M P O R T A N T // U R G E N T
Support bytearrays as well! (Currently, only bytes).
"""
import typing
import bencoder
Types = typing.Union[int, bytes, list, "KRPCDict"]
KRPCDict = typing.Dict[bytes, Types]
@ -38,14 +40,14 @@ KRPCDict = typing.Dict[bytes, Types]
def dumps(obj) -> bytes:
    """Bencode *obj* (int, bytes, list or dict) into bytes.

    Thin wrapper around bencoder.pyx's ``bencode``.

    Raises:
        BencodeEncodingError: when *obj* (or something nested inside it)
            cannot be bencoded.
    """
    # The stale pure-Python dispatch line (`__encode[type(obj)](obj)`) that
    # shadowed this call has been removed — bencoder is the single encoder.
    try:
        return bencoder.bencode(obj)
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed here.
        raise BencodeEncodingError()
def loads(bytes_object: bytes) -> Types:
    """Decode one complete bencoded document and return the decoded value.

    Raises:
        BencodeDecodingError: wrapping the original exception, when
            *bytes_object* is empty or is not valid bencoding.
    """
    # The stale pure-Python dispatch line (`__decoders[...]`) that shadowed
    # this call has been removed — bencoder is the single decoder.
    try:
        # Dispatch on the first byte (b"i", b"l", b"d", or a digit for
        # byte strings), exactly as bencoder does internally; [0] drops
        # the "bytes consumed" index.
        return bencoder.decode_func[bytes_object[0]](bytes_object, 0)[0]
    except Exception as exc:
        raise BencodeDecodingError(exc)
@ -59,102 +61,11 @@ def loads2(bytes_object: bytes) -> typing.Tuple[Types, int]:
print(">>>", dump[i:]) # OUTPUT: >>> b'OH YEAH'
"""
try:
return __decoders[bytes_object[0]](bytes_object, 0)
return bencoder.decode_func[bytes_object[0]](bytes_object, 0)
except Exception as exc:
raise BencodeDecodingError(exc)
def __encode_int(i: int) -> bytes:
    """Bencode an integer: b"i<decimal digits>e"."""
    return ("i%de" % i).encode("ascii")
def __encode_str(s: typing.ByteString) -> bytes:
    """Bencode a byte string: b"<length>:<raw bytes>"."""
    return str(len(s)).encode("ascii") + b":" + bytes(s)
def __encode_list(l: typing.Sequence) -> bytes:
    """Bencode a sequence: b"l" + each item bencoded + b"e".

    Unknown item types surface as KeyError from the __encode dispatch
    table (callers wrap this in BencodeEncodingError).
    """
    encoded = bytearray(b"l")
    for item in l:
        encoded += __encode[type(item)](item)
    encoded += b"e"
    return bytes(encoded)
def __encode_dict(d: typing.Dict[typing.ByteString, typing.Any]) -> bytes:
    """Bencode a dict: b"d" + sorted (key, value) pairs + b"e".

    Keys are emitted in lexicographical order, sorting case-insensitively
    first and using the raw key as a tie-breaker.
    Source: http://stackoverflow.com/a/7375703/4466589
    """
    parts = [b"d"]
    for key in sorted(d, key=lambda k: (k.lower(), k)):
        parts.append(__encode_str(key))
        value = d[key]
        parts.append(__encode[type(value)](value))
    parts.append(b"e")
    return b"".join(parts)
# Dispatch table mapping a Python type to its bencoder.  Unknown types
# surface as KeyError in the callers (wrapped into BencodeEncodingError
# by dumps()).
__encode = {
    int: __encode_int,
    list: __encode_list,
    dict: __encode_dict,
}
# Both flavours of byte strings share one encoder.
__encode.update({bytes: __encode_str, bytearray: __encode_str})
def __decode_int(b: bytes, start_i: int) -> typing.Tuple[int, int]:
    """Decode a bencoded integer ("i<digits>e") starting at *start_i*.

    Returns:
        (value, index of the first byte after the b"e" terminator).

    Raises:
        ValueError: when the terminating b"e" is missing.  An explicit
            raise replaces the previous ``assert``, which silently
            disappears under ``python -O``; callers catch Exception
            broadly, so this stays backward-compatible.
    """
    end_i = b.find(b"e", start_i)
    if end_i == -1:
        raise ValueError("unterminated bencoded integer")
    return int(b[start_i + 1: end_i]), end_i + 1
def __decode_str(b: bytes, start_i: int) -> typing.Tuple[bytes, int]:
    """Decode a bencoded byte string ("<length>:<raw bytes>") at *start_i*.

    Returns:
        (data, index of the first byte after the string).

    Raises:
        ValueError: when the b":" separator is missing, the length prefix
            is not a valid non-negative integer, or the buffer is too
            short to contain the announced number of bytes.  The previous
            ``assert`` disappeared under ``python -O`` and a truncated
            buffer was silently accepted as a short string; callers catch
            Exception broadly, so raising here stays backward-compatible.
    """
    separator_i = b.find(b":", start_i)
    if separator_i == -1:
        raise ValueError("missing ':' separator in bencoded string")
    length = int(b[start_i: separator_i])
    end_i = separator_i + 1 + length
    if length < 0 or end_i > len(b):
        raise ValueError("bad length in bencoded string")
    return b[separator_i + 1: end_i], end_i
def __decode_list(b: bytes, start_i: int) -> typing.Tuple[list, int]:
    """Decode a bencoded list ("l...e") starting at *start_i*.

    Returns (items, index of the first byte after the closing b"e").
    """
    items = []
    i = start_i + 1  # skip the leading b"l"
    while b[i] != ord(b"e"):
        # Dispatch on the current byte to the matching decoder.
        item, i = __decoders[b[i]](b, i)
        items.append(item)
    return items, i + 1  # +1 skips the trailing b"e"
def __decode_dict(b: bytes, start_i: int) -> typing.Tuple[dict, int]:
    """Decode a bencoded dict ("d...e") starting at *start_i*.

    Returns (mapping, index of the first byte after the closing b"e").
    """
    result = {}
    i = start_i + 1  # skip the leading b"d"
    while b[i] != ord(b"e"):
        # Keys must be byte strings, so the next byte has to be an ASCII
        # digit (b"0"..b"9" inclusive).
        assert 48 <= b[i] <= 57
        key, i = __decode_str(b, i)
        result[key], i = __decoders[b[i]](b, i)
    return result, i + 1  # +1 skips the trailing b"e"
# Dispatch table mapping the first byte of a bencoded value to its
# decoder.  Unknown leading bytes surface as KeyError in the callers
# (wrapped into BencodeDecodingError by loads()/loads2()).
__decoders = {
    ord(b"i"): __decode_int,
    ord(b"l"): __decode_list,
    ord(b"d"): __decode_dict,
}
# Byte strings start with their decimal length, so every ASCII digit
# dispatches to the string decoder.  (Iterating a bytes literal yields
# the ordinal of each byte directly.)
__decoders.update({digit: __decode_str for digit in b"0123456789"})
class BencodeEncodingError(Exception):
    """Raised by dumps() when an object cannot be bencoded."""

View File

@ -33,8 +33,9 @@ class DisposablePeer:
self.__socket.setblocking(False)
# To reduce the latency:
self.__socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, True)
if hasattr(socket, 'TCP_QUICKACK'):
if hasattr(socket, "TCP_QUICKACK"):
self.__socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, True)
res = self.__socket.connect_ex(peer_addr)
if res != errno.EINPROGRESS:
raise ConnectionError()

View File

@ -35,14 +35,11 @@ BOOTSTRAPPING_NODES = [
class SybilNode:
# Maximum number of neighbours (this is a THRESHOLD where, once reached, the search for new neighbours will stop;
# but until then, the total number of neighbours might exceed the threshold).
def __init__(self):
def __init__(self, address: typing.Tuple[str, int]):
self.__true_id = self.__random_bytes(20)
self.__socket = socket.socket(type=socket.SOCK_DGRAM)
self.__socket.bind(("0.0.0.0", 0))
self.__socket.bind(address)
self.__socket.setblocking(False)
self.__incoming_buffer = array.array("B", (0 for _ in range(65536)))
@ -52,9 +49,11 @@ class SybilNode:
self.__token_secret = self.__random_bytes(4)
# Maximum number of neighbours (this is a THRESHOLD where, once reached, the search for new neighbours will
# stop; but until then, the total number of neighbours might exceed the threshold).
self.__n_max_neighbours = 2000
logging.debug("SybilNode %s initialized!", self.__true_id.hex().upper())
logging.debug("SybilNode %s on %s initialized!", self.__true_id.hex().upper(), address)
@staticmethod
def when_peer_found(info_hash: InfoHash, peer_addr: PeerAddress) -> None:
@ -83,7 +82,7 @@ class SybilNode:
except bencode.BencodeDecodingError:
continue
if type(message.get(b"r")) is dict and type(message[b"r"].get(b"nodes")) is bytes:
if isinstance(message.get(b"r"), dict) and type(message[b"r"].get(b"nodes")) is bytes:
self.__on_FIND_NODE_response(message)
elif message.get(b"q") == b"get_peers":
self.__on_GET_PEERS_query(message, addr)

View File

@ -41,16 +41,18 @@ class Database:
db_conn = sqlite3.connect(database, isolation_level=None)
db_conn.execute("PRAGMA journal_mode=WAL;")
db_conn.execute("PRAGMA temp_store = 2;")
db_conn.execute("PRAGMA foreign_keys=ON;")
with db_conn:
db_conn.execute("CREATE TABLE IF NOT EXISTS torrents ("
"id INTEGER PRIMARY KEY,"
"id INTEGER PRIMARY KEY AUTOINCREMENT,"
"info_hash BLOB NOT NULL UNIQUE,"
"name TEXT NOT NULL,"
"total_size INTEGER NOT NULL CHECK(total_size > 0),"
"discovered_on INTEGER NOT NULL CHECK(discovered_on > 0)"
");")
db_conn.execute("CREATE INDEX IF NOT EXISTS info_hash_index ON torrents (info_hash);")
db_conn.execute("CREATE TABLE IF NOT EXISTS files ("
"id INTEGER PRIMARY KEY,"
"torrent_id INTEGER REFERENCES torrents ON DELETE CASCADE ON UPDATE RESTRICT,"

View File

@ -8,7 +8,7 @@ def read_file(path):
setup(
name="magneticod",
version="0.1.0",
version="0.2.0",
description="Autonomous BitTorrent DHT crawler and metadata fetcher.",
long_description=read_file("README.rst"),
url="http://magnetico.org",
@ -22,7 +22,8 @@ setup(
},
install_requires=[
"appdirs>=1.4.3"
"appdirs >= 1.4.3",
"bencoder.pyx >= 1.1.3"
],
classifiers=[

View File

@ -101,12 +101,12 @@ def search_torrents():
" discovered_on "
"FROM torrents "
"INNER JOIN ("
" SELECT torrent_id, rank(matchinfo(fts_torrents, 'pcnxal')) AS rank "
" SELECT docid AS id, rank(matchinfo(fts_torrents, 'pcnxal')) AS rank "
" FROM fts_torrents "
" WHERE name MATCH ? "
" ORDER BY rank ASC"
" LIMIT 20 OFFSET ?"
") AS ranktable ON torrents.id=ranktable.torrent_id;",
") AS ranktable USING(id);",
(search, 20 * page)
)
context["torrents"] = [Torrent(t[0].hex(), t[1], utils.to_human_size(t[2]),
@ -138,7 +138,7 @@ def newest_torrents():
" total_size, "
" discovered_on "
"FROM torrents "
"ORDER BY discovered_on DESC LIMIT 20 OFFSET ?",
"ORDER BY id DESC LIMIT 20 OFFSET ?",
(20 * page,)
)
context["torrents"] = [Torrent(t[0].hex(), t[1], utils.to_human_size(t[2]), datetime.fromtimestamp(t[3]).strftime("%d/%m/%Y"), [])
@ -187,15 +187,14 @@ def torrent(**kwargs):
return flask.abort(400)
with magneticod_db:
cur = magneticod_db.execute("SELECT name, discovered_on FROM torrents WHERE info_hash=? LIMIT 1;", (info_hash,))
cur = magneticod_db.execute("SELECT id, name, discovered_on FROM torrents WHERE info_hash=? LIMIT 1;",
(info_hash,))
try:
name, discovered_on = cur.fetchone()
torrent_id, name, discovered_on = cur.fetchone()
except TypeError: # In case no results returned, TypeError will be raised when we try to subscript None object
return flask.abort(404)
cur = magneticod_db.execute("SELECT path, size FROM files "
"WHERE torrent_id=(SELECT id FROM torrents WHERE info_hash=? LIMIT 1);",
(info_hash,))
cur = magneticod_db.execute("SELECT path, size FROM files WHERE torrent_id=?;", (torrent_id,))
raw_files = cur.fetchall()
size = sum(f[1] for f in raw_files)
files = [File(f[0], utils.to_human_size(f[1])) for f in raw_files]
@ -214,12 +213,15 @@ def get_magneticod_db():
magneticod_db = flask.g.magneticod_db = sqlite3.connect(magneticod_db_path, isolation_level=None)
with magneticod_db:
magneticod_db.execute("CREATE VIRTUAL TABLE temp.fts_torrents USING fts4(torrent_id INTEGER, name TEXT NOT NULL);")
magneticod_db.execute("INSERT INTO fts_torrents (torrent_id, name) SELECT id, name FROM torrents;")
magneticod_db.execute("PRAGMA journal_mode=WAL;")
magneticod_db.execute("PRAGMA temp_store=2;")
magneticod_db.execute("CREATE VIRTUAL TABLE temp.fts_torrents USING fts4(name);")
magneticod_db.execute("INSERT INTO fts_torrents (docid, name) SELECT id, name FROM torrents;")
magneticod_db.execute("INSERT INTO fts_torrents (fts_torrents) VALUES ('optimize');")
magneticod_db.execute("CREATE TEMPORARY TRIGGER on_torrents_insert AFTER INSERT ON torrents FOR EACH ROW BEGIN"
" INSERT INTO fts_torrents (torrent_id, name) VALUES (NEW.id, NEW.name);"
" INSERT INTO fts_torrents (docid, name) VALUES (NEW.id, NEW.name);"
"END;")
magneticod_db.create_function("rank", 1, utils.rank)

View File

@ -37,10 +37,10 @@ def rank(blob):
x.append((x0, x1, x2))
# Ignore the first column (torrent_id)
_, avgdl = unpack_from("=LL", blob, 12 + 3*c*p*4)
avgdl = unpack_from("=L", blob, 12 + 3*c*p*4)[0]
# Ignore the first column (torrent_id)
_, l = unpack_from("=LL", blob, (12 + 3*c*p*4) + 4*c)
l = unpack_from("=L", blob, (12 + 3*c*p*4) + 4*c)[0]
# Multiply by -1 so that sorting in the ASC order would yield the best match first
return -1 * okapi_bm25(term_frequencies=[X[0] for X in x], dl=l, avgdl=avgdl, N=n, nq=[X[2] for X in x])

View File

@ -8,7 +8,7 @@ def read_file(path):
setup(
name="magneticow",
version="0.1.0",
version="0.2.0",
description="Lightweight web interface for magnetico.",
long_description=read_file("README.rst"),
url="http://magnetico.org",
@ -23,9 +23,9 @@ setup(
},
install_requires=[
"appdirs>=1.4.3",
"flask>=0.12.1",
"gevent>=1.2.1"
"appdirs >= 1.4.3",
"flask >= 0.12.1",
"gevent >= 1.2.1"
],
classifiers=[