-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_dht.py
executable file
·70 lines (62 loc) · 2.27 KB
/
scrape_dht.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
import os, btdht, binascii, hashlib, time, base64, json
from database import database
"""
This script is at the heart of our data collection. It's intended to be run
as a cronjob every five minutes or so. It connects to the DHT, waits
STARTUP_DELAY seconds to find peers, then performs a lookup on each of our
torrent hashes to determine the number of peers for that torrent. Logs
results to our database, then exits.
"""
# Seconds to sleep after starting the DHT node, giving it time to peer
# before any lookups are issued.
STARTUP_DELAY = 120 # How long to wait for the DHT to peer before querying
# Fixed 20-character node identifier; ASCII-encoded and passed as `id` to
# btdht.DHT when the script starts.
ID = "r9T5uUAuTG00hwKyBkZw"
# Local port the DHT node binds to (passed as `bind_port` to btdht.DHT).
BIND_PORT = 2987
# Returns a list of torrent hashes to examine
def getTorrentHashes():
    """Fetch every distinct info hash stored in the `torrents` table.

    Returns:
        A list holding the first column of each row returned by the
        SELECT DISTINCT query.
    """
    with database() as (_conn, cursor):
        cursor.execute("SELECT DISTINCT hash FROM torrents")
        rows = cursor.fetchall()
        return [row[0] for row in rows]
# Support both hexadecimal and older b32 encoded info hashes
def hashToBinary(info_hash):
    """Decode a textual torrent info hash into its raw bytes.

    Args:
        info_hash: Either a 40-character hexadecimal string or a
            32-character base32 string. Both encodings are accepted in
            upper or lower case.

    Returns:
        The decoded hash as a bytes object.

    Raises:
        ValueError: If the length is neither 40 nor 32, or if the text
            contains characters that are invalid for the detected
            encoding (binascii.Error is a ValueError subclass).
    """
    if len(info_hash) == 40:
        return binascii.a2b_hex(info_hash)
    if len(info_hash) == 32:
        # b32decode rejects lowercase letters unless casefold is enabled;
        # hex decoding above is already case-insensitive, so match that.
        return base64.b32decode(info_hash, casefold=True)
    raise ValueError("Torrent info hash '%s' is invalid" % info_hash)
# Returns the set of hashed IP addresses peered with a torrent
def getPeers(dht, info_hash, salt):
    """Look up a torrent's peers on the DHT and anonymise their addresses.

    Args:
        dht: Object exposing `get_peers(binary_hash)`; the script passes a
            started btdht.DHT instance.
        info_hash: Textual info hash, decoded with hashToBinary() before
            the lookup.
        salt: String appended to each IP address before hashing.

    Returns:
        A set of hex SHA-256 digests, one per distinct `ip + salt` value.
        Empty when the lookup returns None. The peer's port is ignored,
        so several peers on the same IP collapse into one entry.

    Raises:
        ValueError: Propagated from hashToBinary() for a malformed hash.
        UnicodeEncodeError: If `ip + salt` contains non-ASCII characters.
    """
    peers = dht.get_peers(hashToBinary(info_hash))
    if peers is None:
        return set()
    return {
        hashlib.sha256((ip + salt).encode("ascii")).hexdigest()
        for ip, _port in peers
    }
def logPeers(torrentPeers):
    """Insert one `peers` row per (torrent, hashed peer) pair.

    Args:
        torrentPeers: Mapping of info hash -> iterable of hashed peer
            identifiers (as produced by getPeers()).

    Every row written by a single call carries the same Unix timestamp,
    taken once before the inserts begin. The fourth column is always
    written as True; its meaning is defined by the table schema, which is
    not visible here. Committing is presumably handled by the database()
    context manager -- TODO confirm.
    """
    epoch = int(time.time())
    with database() as (conn, c):
        for info_hash, peers in torrentPeers.items():
            for peer in peers:
                c.execute("INSERT INTO peers VALUES(%s,%s,%s,%s)", [info_hash, epoch, peer, True])
def loadSalt():
    """Read the hashing salt from config.json located beside this script.

    Returns:
        The value stored under the "salt" key of the JSON document.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    with open(script_dir + "/config.json", "r") as config_file:
        settings = json.load(config_file)
    return settings["salt"]
if __name__ == "__main__":
    # Start the DHT node, then load the salt used to anonymise peer IPs.
    dht = btdht.DHT(id=ID.encode("ascii"), bind_port=BIND_PORT)
    dht.start()
    salt = loadSalt()
    print("DHT created, waiting to peer...")
    # Give the node time to find peers before issuing any lookups.
    time.sleep(STARTUP_DELAY)
    # NOTE(review): this loop never terminates and has no pause between
    # passes, while the description at the top of the file says the script
    # logs once and exits (cron-driven). Confirm which behaviour is intended.
    while True:
        torrentPeers = {}
        for info_hash in getTorrentHashes():
            found = getPeers(dht, info_hash, salt)
            torrentPeers[info_hash] = found
            print("Found %3d peers for %s" % (len(found), info_hash))
        # One database write per full pass over all known torrents.
        logPeers(torrentPeers)