# respodns/top1m.py
# Source list of the top one million domains (Cisco Umbrella snapshot).
# https://github.com/PeterDaveHello/top-1m-domains
urltop_default = "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"

csvfn_default = "top-1m.csv"  # path within the zip file

one_week = 7 * 24 * 60 * 60  # cache expiry, in seconds

def alive(fp, expiry):
    """Return True if the cache file *fp* is usable.

    A file is "alive" when it exists, its mtime is less than *expiry*
    seconds in the past, and it holds more than 2 bytes of content.

    :param fp: path to the cached file
    :param expiry: maximum age in seconds before the cache is stale
    """
    from os.path import exists, getmtime, getsize
    from time import time
    # 2 is used as a magic number because it's len("\r\n"):
    # a file holding only a line terminator is treated as empty.
    return exists(fp) and time() < getmtime(fp) + expiry and getsize(fp) > 2


def download_top1m(urltop=None, csvfn=None):
    """Download the zipped top-1m list and return the CSV text.

    :param urltop: URL of the zip archive (default: ``urltop_default``)
    :param csvfn: member name of the CSV inside the zip
        (default: ``csvfn_default``)
    :return: the CSV contents decoded as UTF-8
    """
    from io import BytesIO
    from urllib.request import urlopen
    from zipfile import ZipFile
    if urltop is None:
        urltop = urltop_default
    if csvfn is None:
        csvfn = csvfn_default
    # Fetch the whole archive into memory; the zip directory lives at the
    # end of the file, so ZipFile needs a seekable buffer.
    with urlopen(urltop) as resp:  # renamed from `re`, which shadows the regex module name
        comp = BytesIO(resp.read())
    with ZipFile(comp) as zipf:
        with zipf.open(csvfn) as member:
            uncomp = member.read()
    return uncomp.decode("utf-8")


def retrieve_top1m_entries(csv_fp="top-1m.csv"):
    """Return the top-1m list as ``[(rank, domain), ...]`` tuples.

    Uses *csv_fp* as an on-disk cache: a fresh copy (younger than one
    week, see :func:`alive`) is read back, otherwise the list is
    re-downloaded and the cache rewritten.

    :param csv_fp: path of the cached CSV file
    """
    from sys import stderr
    if alive(csv_fp, one_week):
        # download_top1m() decodes UTF-8, so read/write the cache with an
        # explicit encoding instead of the locale default.
        with open(csv_fp, "r", encoding="utf-8") as f:
            uncomp = f.read()
    else:
        print("downloading", csv_fp, file=stderr)
        uncomp = download_top1m()
        with open(csv_fp, "w", encoding="utf-8") as f:
            f.write(uncomp)
    # we could use the csv module, but this is totally overkill
    # for data that *should* be just a subset of ascii.
    entries = [(int(rank), domain)
               for rank, _sep, domain in
               (line.partition(",") for line in uncomp.splitlines())
               if rank]
    return entries