# https://github.com/PeterDaveHello/top-1m-domains
urltop_default = "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
csvfn_default = "top-1m.csv"  # path within the zip file
one_week = 7 * 24 * 60 * 60  # in seconds


def alive(fp, expiry):
    """Return True if file *fp* exists, is newer than *expiry* seconds,
    and is non-trivially sized.

    Parameters:
        fp: filesystem path to check.
        expiry: freshness window in seconds (e.g. ``one_week``).
    """
    from os.path import exists, getmtime, getsize
    from time import time
    # 2 is used as a magic number because it's len("\r\n"):
    # a file holding only a line terminator counts as empty.
    return exists(fp) and time() < getmtime(fp) + expiry and getsize(fp) > 2


def download_top1m(urltop=None, csvfn=None):
    """Download the top-1m zip archive and return the contained CSV as text.

    Parameters:
        urltop: URL of the zip archive (defaults to ``urltop_default``).
        csvfn: name of the CSV member inside the zip (defaults to
            ``csvfn_default``).

    Returns:
        The CSV contents decoded as UTF-8.

    Raises:
        urllib.error.URLError: on network failure.
        KeyError: if *csvfn* is not present in the archive.
    """
    from io import BytesIO
    from urllib.request import urlopen
    from zipfile import ZipFile
    if urltop is None:
        urltop = urltop_default
    if csvfn is None:
        csvfn = csvfn_default
    # ZipFile needs a seekable stream, so buffer the whole download in memory.
    comp = BytesIO()
    with urlopen(urltop) as resp:  # renamed from `re` to avoid shadowing the stdlib module name
        comp.write(resp.read())
    comp.seek(0)
    with ZipFile(comp) as zipf:
        with zipf.open(csvfn) as f:
            uncomp = f.read()
    comp.close()
    return uncomp.decode("utf-8")


def retrieve_top1m_entries(csv_fp="top-1m.csv"):
    """Return the top-1m list as ``(rank, domain)`` tuples, using *csv_fp*
    as an on-disk cache that is refreshed when older than one week.

    Parameters:
        csv_fp: path of the local CSV cache file.

    Returns:
        list[tuple[int, str]]: parsed ``(rank, domain)`` entries.
    """
    from sys import stderr
    if alive(csv_fp, one_week):
        with open(csv_fp, "r", encoding="utf-8") as f:
            uncomp = f.read()
    else:
        print("downloading", csv_fp, file=stderr)
        uncomp = download_top1m()
        with open(csv_fp, "w", encoding="utf-8") as f:
            f.write(uncomp)
    # we could use the csv module, but this is totally overkill
    # for data that *should* be just a subset of ascii.
    entries = []
    for line in uncomp.splitlines():
        if not line:
            continue
        rank, _, domain = line.partition(",")
        entries.append((int(rank), domain))
    return entries