# respodns/respodns/checks.py

from .top1m import retrieve_top1m_entries
from .util import concat_nonsense, rot13, head
from .structs import Check

# checking this domain first avoids issues with censorship in China. see:
# https://www.bortzmeyer.org/sichuan-pepper.html
first = [Check("common", "baidu.com")]

# each entry pairs a category label with a domain name to look up.
likely = [
    # these checks are, in practice, the most likely
    # to weed out unwanted DNS servers.

    Check("news",           "huanqiu.com"),
    # NOTE(review): this name (and the "shock" one below) is kept
    # rot13-encoded, presumably so the plain domain does not appear
    # in this file -- confirm.
    Check("adware",         rot13("nqf789.pbz")),

    # some servers block shock sites, which isn't a terrible idea,
    # but it's inaccurate.
    Check("shock",          rot13("tbng.pk")),  # actually parked at this time

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("porn",           "pornhub.com"),

    Check("adtrack",        "media.fastclick.net"),

    # dns.watch fails here: domain parking is evil, but servers must abide.
    Check("parking",        "scmp.org"),

    # some servers block sites driven by their chaotic user-created content.
    Check("usercontent",    "4chan.org"),

    # some servers like to redirect nonexistent domains. see:
    # https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/
    # NOTE(review): concat_nonsense presumably builds a name under the
    # given suffix that should not exist -- verify in .util.
    Check("bad",            concat_nonsense("com")),

    # blogspot handles these strangely; DNS servers likewise.
    Check("weirdsub",       concat_nonsense("javarevisited.blogspot.com")),

    # NOTE: disabled for being wildly inconsistent:
    # Cloudflare fails here. see:
    # https://jarv.is/notes/cloudflare-dns-archive-is-blocked/
    # Check("weird",          "archive.is"),

    Check("common",         "archive.org"),

    # this is one of the WannaCry sinkholes, it's kinda important.
    Check("infosec",        "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),

    # try out internationalized domains
    # (written here in their ASCII "xn--" form).
    Check("common",         "xn--b1aew.xn--p1ai"),

    Check("common",         "wikileaks.com"),

    # i suppose this doubles as a check for the new TLDs.
    Check("uncommon",       "cybre.space"),

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("piracy",         "thehiddenbay.org"),

    # some servers block this. see:
    # https://scan.shadowserver.org/dns/
    Check("infosec",        "scan.shadowserver.org"),

    # a few servers block this for some reason.
    Check("common",         "duckduckgo.com"),

    # DNS poisoning may yield an unwanted result here.
    Check("badsub",         concat_nonsense("google.com")),

    Check("common",         "en.wikipedia.org"),
    Check("adtrack",        "google-analytics.com"),
    Check("adtrack",        "ad.doubleclick.net"),
    # baidu goes here...?
    Check("common",         "naver.com"),

    # surely a fully-functional server would resolve
    # the most popular domain in existence, right?
    Check("common",         "google.com"),
]

# well-known sites, listed as (category, domain) pairs and turned into
# Check entries in the order written.
# NOTE(review): going by the name, these are presumably the checks that
# rarely weed out a server -- confirm against the code that consumes them.
unlikely = [
    Check(category, domain)
    for category, domain in (
        ("piracy",      "thepiratebay.org"),
        ("porn",        "xvideos.com"),
        ("usercontent", "imgur.com"),
        ("usercontent", "twitter.com"),
        ("usercontent", "weibo.com"),
        ("usercontent", "github.com"),
        ("porn",        "chaturbate.com"),
        ("video",       "bilibili.com"),
        ("video",       "twitch.tv"),
        ("common",      "qq.com"),
        ("video",       "netflix.com"),
        ("usercontent", "reddit.com"),
        ("usercontent", "facebook.com"),
        ("video",       "youtube.com"),
        ("usercontent", "tumblr.com"),
        ("usercontent", "wordpress.com"),
        ("common",      "tmall.com"),
        ("usercontent", "instagram.com"),
        ("news",        "nytimes.com"),
        ("usercontent", "flickr.com"),
        ("common",      "ebay.com"),
        ("news",        "scmp.com"),
        ("common",      "aliexpress.com"),
        ("common",      "stackoverflow.com"),
    )
]

# domains that are left out when checks are generated from the top-1m list.
defunct = ["panda.tv"]  # imochen.github.io


def _top1m_gen():
    """Return a generator of "top" checks built from the top-1m entries.

    Every call invokes ``retrieve_top1m_entries()`` anew, right away;
    the checks themselves are produced lazily as the generator is consumed.
    Entries found in ``defunct`` are skipped.
    """
    pairs = retrieve_top1m_entries()
    # only the second element of each retrieved pair is used here.
    return (
        Check("top", domain)
        for _, domain in pairs
        if domain not in defunct
    )


# check lists built at import time from the top-1m entries.
# each line passes its own _top1m_gen() generator to head(), so the
# entries are retrieved once per list while this module is being imported.
# NOTE(review): head(n, iterable) presumably keeps the first n items --
# confirm in .util.
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())