53 lines
1.4 KiB
Python
53 lines
1.4 KiB
Python
# Default URL of the Alexa top-1m ranking archive (a zip containing one CSV).
# NOTE(review): plain http, not https — confirm whether the endpoint still
# serves and whether an https scheme is preferred.
urltop_default = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
# Name of the CSV member inside the zip; also used as the local cache filename.
csvfn_default = "top-1m.csv"
# Cache freshness window used by retrieve_top1m_entries().
one_week = 7 * 24 * 60 * 60 # in seconds
|
|
|
|
|
|
def alive(fp, expiry):
    """Return True if the file at *fp* is a usable cache entry.

    A file is "alive" when it exists, its mtime is less than *expiry*
    seconds in the past, and it is more than 2 bytes long (guards
    against an empty/truncated download having been cached).

    Uses a single os.stat() call instead of separate exists/getmtime/
    getsize checks, so the file cannot disappear between the existence
    test and the metadata reads.
    """
    from os import stat
    from time import time

    try:
        st = stat(fp)
    except OSError:
        # Missing file (or any other stat failure) means no valid cache.
        return False
    return time() < st.st_mtime + expiry and st.st_size > 2
|
|
|
|
|
|
def download_top1m(urltop=None, csvfn=None):
    """Fetch the top-1m zip archive and return its CSV member as text.

    Parameters default to the module-level urltop_default / csvfn_default
    when given as None.  The archive is buffered fully in memory, the
    named CSV member is extracted, and its bytes are decoded as UTF-8.
    """
    from io import BytesIO
    from urllib.request import urlopen
    from zipfile import ZipFile

    url = urltop_default if urltop is None else urltop
    member = csvfn_default if csvfn is None else csvfn

    # Buffer the whole download; ZipFile needs a seekable file object.
    with urlopen(url) as response:
        archive_buf = BytesIO(response.read())

    with ZipFile(archive_buf) as archive, archive.open(member) as entry:
        raw = entry.read()
    archive_buf.close()

    return raw.decode("utf-8")
|
|
|
|
|
|
def retrieve_top1m_entries(csv_fp="top-1m.csv"):
    """Return the top-1m list as a list of (rank, domain) tuples.

    Uses *csv_fp* as an on-disk cache: if the file exists, is fresher
    than one week, and is non-empty (see alive()), it is read back;
    otherwise the list is re-downloaded and the cache rewritten.

    The cache is read and written as UTF-8 explicitly, matching the
    UTF-8 decode in download_top1m() — relying on the platform default
    encoding would corrupt the round-trip on non-UTF-8 locales.
    """
    from sys import stderr

    if alive(csv_fp, one_week):
        with open(csv_fp, "r", encoding="utf-8") as f:
            uncomp = f.read()
    else:
        print("downloading", csv_fp, file=stderr)
        uncomp = download_top1m()
        with open(csv_fp, "w", encoding="utf-8") as f:
            f.write(uncomp)

    # We could use the csv module, but that is overkill for data that
    # *should* be just "rank,domain" lines in a subset of ASCII.
    entries = []
    for line in uncomp.splitlines():
        if not line:
            continue  # tolerate trailing/blank lines instead of crashing int("")
        rank, _, domain = line.partition(",")
        entries.append((int(rank), domain))
    return entries
|