respodns/respodns/checks.py

from .top1m import retrieve_top1m_entries
from .util import concat_nonsense, rot13, head
from .structs import Check

# checks intended to be performed before any of the others.
first = [
    # checking this first avoids issues with censorship in China. see:
    # https://www.bortzmeyer.org/sichuan-pepper.html
    Check("common",         "baidu.com"),
]

# each Check appears to pair a short category label with the domain to query.
# NOTE(review): rot13() and concat_nonsense() come from .util; presumably
# rot13() keeps distasteful domain names out of plain sight in this file, and
# concat_nonsense() builds a made-up name under the given suffix -- confirm.
likely = [
    # these checks are, in practice, the most likely
    # to weed out unwanted DNS servers.

    Check("news",           "huanqiu.com"),
    Check("adware",         rot13("nqf789.pbz")),

    # some servers block shock sites, which isn't a terrible idea,
    # but it's inaccurate.
    Check("shock",          rot13("tbng.pk")),  # actually parked at this time

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("porn",           "pornhub.com"),

    Check("adtrack",        "media.fastclick.net"),

    # dns.watch fails here: domain parking is evil, but servers must abide.
    Check("parking",        "scmp.org"),

    # some servers block sites driven by their chaotic user-created content.
    Check("usercontent",    "4chan.org"),

    # some servers like to redirect nonexistent domains. see:
    # https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/
    Check("bad",            concat_nonsense("com")),

    # blogspot handles these strangely; DNS servers do likewise.
    Check("weirdsub",       concat_nonsense("javarevisited.blogspot.com")),

    # NOTE: disabled for being wildly inconsistent:
    # Cloudflare fails here. see:
    # https://jarv.is/notes/cloudflare-dns-archive-is-blocked/
    # Check("weird",          "archive.is"),

    Check("common",         "archive.org"),

    # this is one of the WannaCry sinkholes, it's kinda important.
    Check("infosec",        "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),

    # try out internationalized domains.
    Check("common",         "xn--b1aew.xn--p1ai"),

    Check("common",         "wikileaks.com"),

    # i suppose this doubles as a check for the new TLDs.
    Check("uncommon",       "cybre.space"),

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("piracy",         "thehiddenbay.org"),

    # some servers block this. see:
    # https://scan.shadowserver.org/dns/
    Check("infosec",        "scan.shadowserver.org"),

    # a few servers block this for some reason.
    Check("common",         "duckduckgo.com"),

    # DNS poisoning may yield an unwanted result here.
    Check("badsub",         concat_nonsense("google.com")),

    Check("common",         "en.wikipedia.org"),
    Check("adtrack",        "google-analytics.com"),
    Check("adtrack",        "ad.doubleclick.net"),
    # baidu goes here...? (baidu.com is already covered by the `first` list.)
    Check("common",         "naver.com"),

    # surely a fully-functional server would resolve
    # the most popular domain in existence, right?
    Check("common",         "google.com"),
]

# NOTE(review): presumably the counterpart to `likely` -- checks that are
# less likely to weed out unwanted DNS servers. confirm against the callers.
unlikely = [
    Check("piracy",         "thepiratebay.org"),
    Check("porn",           "xvideos.com"),
    Check("usercontent",    "imgur.com"),
    Check("usercontent",    "twitter.com"),
    Check("usercontent",    "weibo.com"),
    Check("usercontent",    "github.com"),
    Check("porn",           "chaturbate.com"),
    Check("video",          "bilibili.com"),
    Check("video",          "twitch.tv"),
    Check("common",         "qq.com"),
    Check("video",          "netflix.com"),
    Check("usercontent",    "reddit.com"),
    Check("usercontent",    "facebook.com"),
    Check("video",          "youtube.com"),
    Check("usercontent",    "tumblr.com"),
    Check("usercontent",    "wordpress.com"),
    Check("common",         "tmall.com"),
    Check("usercontent",    "instagram.com"),
    Check("news",           "nytimes.com"),
    Check("usercontent",    "flickr.com"),
    Check("common",         "ebay.com"),
    Check("news",           "scmp.com"),
    Check("common",         "aliexpress.com"),
    Check("common",         "stackoverflow.com"),
]

# entries listed here are skipped when building checks from the top-1m list
# (see the filter in _top1m_gen).
defunct = [
    "panda.tv",  # imochen.github.io
]


def _top1m_gen():
    """Return a lazy stream of "top" checks built from the top-1m entries.

    Every item from retrieve_top1m_entries() is unpacked as a pair; only the
    second element is used. Entries found in ``defunct`` are skipped.
    """
    # the source is fetched right here, at call time; only the wrapping into
    # Check objects and the defunct filtering are deferred until iteration.
    pairs = retrieve_top1m_entries()
    return (
        Check("top", name)
        for _, name in pairs
        if name not in defunct
    )


# built at import time; each one consumes its own fresh _top1m_gen() stream.
# NOTE(review): presumably head(n, it) keeps only the first n items -- confirm
# in .util whether the result is a list or a one-shot iterator.
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())
init 2020-08-29 01:16:06 -07:00			`from .top1m import retrieve_top1m_entries`
begin reorganizing code 2020-08-29 05:54:07 -07:00			`from .util import concat_nonsense, rot13, head`
			`from .structs import Check`
init 2020-08-29 01:16:06 -07:00
reorganize checks 2020-08-29 01:28:26 -07:00			`first = [`
reword the comment on censorship 2020-09-03 04:31:21 -07:00			`# checking this first this avoids issues with censorship in China. see:`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`# https://www.bortzmeyer.org/sichuan-pepper.html`
			`Check("common", "baidu.com"),`
init 2020-08-29 01:16:06 -07:00			`]`

reorganize checks 2020-08-29 01:28:26 -07:00			`likely = [`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`# these checks are, in practice, the most likely`
			`# to weed out unwanted DNS servers.`
superficial cleanup 2020-08-30 03:19:42 -07:00
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`Check("news", "huanqiu.com"),`
			`Check("adware", rot13("nqf789.pbz")),`

			`# some servers block shock sites, which isn't a terrible idea,`
			`# but it's inaccurate.`
write comment about a shock domain being parked 2020-08-31 23:25:59 -07:00			`Check("shock", rot13("tbng.pk")), # actually parked at this time`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00
			`# some servers block piracy and porn sites for being taboo, or whatever.`
			`Check("porn", "pornhub.com"),`
superficial cleanup 2020-08-30 03:19:42 -07:00
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`Check("adtrack", "media.fastclick.net"),`

			`# dns.watch fails here: domain parking is evil, but servers must abide.`
			`Check("parking", "scmp.org"),`

			`# some servers block sites driven by their chaotic user-created content.`
			`Check("usercontent", "4chan.org"),`

			`# some servers like to redirect nonexistent domains. see:`
			`# https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/`
			`Check("bad", concat_nonsense("com")),`

			`# blogspot handles these strangely; DNS servers likewise`
			`Check("weirdsub", concat_nonsense("javarevisited.blogspot.com")),`

init 2020-08-29 01:16:06 -07:00			`# NOTE: disabled for being wildly inconsistent:`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`# Cloudflare fails here. see:`
			`# https://jarv.is/notes/cloudflare-dns-archive-is-blocked/`
disable archive.is once more 2020-08-29 06:44:16 -07:00			`# Check("weird", "archive.is"),`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00
			`Check("common", "archive.org"),`

			`# this is one of the WannaCry sinkholes, it's kinda important.`
			`Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),`

			`# try out internationalized domains.`
			`Check("common", "xn--b1aew.xn--p1ai"),`

			`Check("common", "wikileaks.com"),`

			`# i suppose this doubles as a check for the new TLDs.`
			`Check("uncommon", "cybre.space"),`

			`# some servers block piracy and porn sites for being taboo, or whatever`
			`Check("piracy", "thehiddenbay.org"),`

			`# some servers block this. see:`
			`# https://scan.shadowserver.org/dns/`
			`Check("infosec", "scan.shadowserver.org"),`

			`# a few servers block this for some reason.`
			`Check("common", "duckduckgo.com"),`

			`# DNS poisoning may yield an unwanted result here.`
			`Check("badsub", concat_nonsense("google.com")),`

			`Check("common", "en.wikipedia.org"),`
			`Check("adtrack", "google-analytics.com"),`
			`Check("adtrack", "ad.doubleclick.net"),`
init 2020-08-29 01:16:06 -07:00			`# baidu goes here...?`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`Check("common", "naver.com"),`

			`# surely a fully-functional server would resolve`
			`# the most popular domain in existence, right?`
			`Check("common", "google.com"),`
init 2020-08-29 01:16:06 -07:00			`]`

reorganize checks 2020-08-29 01:28:26 -07:00			`unlikely = [`
abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00			`Check("piracy", "thepiratebay.org"),`
			`Check("porn", "xvideos.com"),`
			`Check("usercontent", "imgur.com"),`
			`Check("usercontent", "twitter.com"),`
			`Check("usercontent", "weibo.com"),`
			`Check("usercontent", "github.com"),`
			`Check("porn", "chaturbate.com"),`
			`Check("video", "bilibili.com"),`
			`Check("video", "twitch.tv"),`
			`Check("common", "qq.com"),`
			`Check("video", "netflix.com"),`
			`Check("usercontent", "reddit.com"),`
			`Check("usercontent", "facebook.com"),`
			`Check("video", "youtube.com"),`
			`Check("usercontent", "tumblr.com"),`
			`Check("usercontent", "wordpress.com"),`
			`Check("common", "tmall.com"),`
			`Check("usercontent", "instagram.com"),`
			`Check("news", "nytimes.com"),`
			`Check("usercontent", "flickr.com"),`
			`Check("common", "ebay.com"),`
			`Check("news", "scmp.com"),`
			`Check("common", "aliexpress.com"),`
			`Check("common", "stackoverflow.com"),`
init 2020-08-29 01:16:06 -07:00			`]`

			`defunct = [`
			`"panda.tv", # imochen.github.io`
			`]`

abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00
init 2020-08-29 01:16:06 -07:00			`def _top1m_gen():`
			`return (Check("top", entry)`
			`for i, entry in retrieve_top1m_entries()`
			`if entry not in defunct)`

abide to PEP 808 (passes pycodestyle) 2020-08-29 06:34:46 -07:00
init 2020-08-29 01:16:06 -07:00			`top100 = head(100, _top1m_gen())`
			`top1000 = head(1000, _top1m_gen())`