# respodns/top1m.py
# Source list of the top one million domains (Cisco Umbrella snapshot).
# https://github.com/PeterDaveHello/top-1m-domains
urltop_default = "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"

csvfn_default = "top-1m.csv"  # path within the zip file

one_week = 7 * 24 * 60 * 60  # cache expiry, in seconds

def alive(fp, expiry):
    """Return True if the cache file *fp* is usable.

    A file is "alive" when it exists, its mtime is less than *expiry*
    seconds in the past, and it holds more than 2 bytes of content.

    :param fp: path to the cached file
    :param expiry: maximum age in seconds before the cache is stale
    """
    from os.path import exists, getmtime, getsize
    from time import time
    # 2 is used as a magic number because it's len("\r\n"):
    # a file holding only a line terminator is treated as empty.
    return exists(fp) and time() < getmtime(fp) + expiry and getsize(fp) > 2


def download_top1m(urltop=None, csvfn=None):
    """Download the zipped top-1m list and return the CSV text.

    :param urltop: URL of the zip archive (default: ``urltop_default``)
    :param csvfn: member name of the CSV inside the zip
        (default: ``csvfn_default``)
    :return: the CSV contents decoded as UTF-8
    """
    from io import BytesIO
    from urllib.request import urlopen
    from zipfile import ZipFile
    if urltop is None:
        urltop = urltop_default
    if csvfn is None:
        csvfn = csvfn_default
    # Fetch the whole archive into memory; the zip directory lives at the
    # end of the file, so ZipFile needs a seekable buffer.
    with urlopen(urltop) as resp:  # renamed from `re`, which shadows the regex module name
        comp = BytesIO(resp.read())
    with ZipFile(comp) as zipf:
        with zipf.open(csvfn) as member:
            uncomp = member.read()
    return uncomp.decode("utf-8")


def retrieve_top1m_entries(csv_fp="top-1m.csv"):
    """Return the top-1m list as ``[(rank, domain), ...]`` tuples.

    Uses *csv_fp* as an on-disk cache: a fresh copy (younger than one
    week, see :func:`alive`) is read back, otherwise the list is
    re-downloaded and the cache rewritten.

    :param csv_fp: path of the cached CSV file
    """
    from sys import stderr
    if alive(csv_fp, one_week):
        # download_top1m() decodes UTF-8, so read/write the cache with an
        # explicit encoding instead of the locale default.
        with open(csv_fp, "r", encoding="utf-8") as f:
            uncomp = f.read()
    else:
        print("downloading", csv_fp, file=stderr)
        uncomp = download_top1m()
        with open(csv_fp, "w", encoding="utf-8") as f:
            f.write(uncomp)
    # we could use the csv module, but this is totally overkill
    # for data that *should* be just a subset of ascii.
    entries = [(int(rank), domain)
               for rank, _sep, domain in
               (line.partition(",") for line in uncomp.splitlines())
               if rank]
    return entries