"""Predefined lists of ``Check(category, domain)`` entries.

Besides the hand-written lists (``first``, ``new``, ``likely``,
``unlikely``), this module builds ``top100`` and ``top1000`` from the
entries yielded by ``retrieve_top1m_entries()``.
"""

from .top1m import retrieve_top1m_entries
from .util import concat_nonsense, rot13, head
from .structs import Check

first = [
    # this avoids issues with chinese censorship. see:
    # https://www.bortzmeyer.org/sichuan-pepper.html
    Check("common", "baidu.com"),
]

new = [
    # via dnsvalidator
    Check("adtrack", "bet365.com"),
    Check("common", "facebook.com"),
    Check("common", "google.com"),
    Check("common", "paypal.com"),
    Check("common", "wikileaks.com"),
    Check("news", "telegram.com"),
]

likely = [
    # these checks are, in practice, the most likely
    # to weed out unwanted DNS servers.
    Check("news", "huanqiu.com"),
    Check("adware", rot13("nqf789.pbz")),

    # some servers block shock sites, which isn't a terrible idea,
    # but it's inaccurate.
    Check("shock", rot13("tbng.pk")),

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("porn", "pornhub.com"),

    Check("adtrack", "media.fastclick.net"),

    # dns.watch fails here: domain parking is evil, but servers must abide.
    Check("parking", "scmp.org"),

    # some servers block sites driven by their chaotic user-created content.
    Check("usercontent", "4chan.org"),

    # some servers like to redirect nonexistent domains. see:
    # https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/
    Check("bad", concat_nonsense("com")),

    # blogspot handles these strangely; DNS servers likewise.
    Check("weirdsub", concat_nonsense("javarevisited.blogspot.com")),

    # NOTE: disabled for being wildly inconsistent:
    # Cloudflare fails here. see:
    # https://jarv.is/notes/cloudflare-dns-archive-is-blocked/
    # Check("weird", "archive.is"),

    Check("common", "archive.org"),

    # this is one of the WannaCry sinkholes, it's kinda important.
    Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),

    # try out internationalized domains.
    Check("common", "xn--b1aew.xn--p1ai"),

    Check("common", "wikileaks.com"),

    # i suppose this doubles as a check for the new TLDs.
    Check("uncommon", "cybre.space"),

    # some servers block piracy and porn sites for being taboo, or whatever.
    Check("piracy", "thehiddenbay.org"),

    # some servers block this. see:
    # https://scan.shadowserver.org/dns/
    Check("infosec", "scan.shadowserver.org"),

    # a few servers block this for some reason.
    Check("common", "duckduckgo.com"),

    # DNS poisoning may yield an unwanted result here.
    Check("badsub", concat_nonsense("google.com")),

    Check("common", "en.wikipedia.org"),
    Check("adtrack", "google-analytics.com"),
    Check("adtrack", "ad.doubleclick.net"),

    # baidu goes here...?
    Check("common", "naver.com"),

    # surely a fully-functional server would resolve
    # the most popular domain in existence, right?
    Check("common", "google.com"),
]

unlikely = [
    Check("piracy", "thepiratebay.org"),
    Check("porn", "xvideos.com"),
    Check("usercontent", "imgur.com"),
    Check("usercontent", "twitter.com"),
    Check("usercontent", "weibo.com"),
    Check("usercontent", "github.com"),
    Check("porn", "chaturbate.com"),
    Check("video", "bilibili.com"),
    Check("video", "twitch.tv"),
    Check("common", "qq.com"),
    Check("video", "netflix.com"),
    Check("usercontent", "reddit.com"),
    Check("usercontent", "facebook.com"),
    Check("video", "youtube.com"),
    Check("usercontent", "tumblr.com"),
    Check("usercontent", "wordpress.com"),
    Check("common", "tmall.com"),
    Check("usercontent", "instagram.com"),
    Check("news", "nytimes.com"),
    Check("usercontent", "flickr.com"),
    Check("common", "ebay.com"),
    Check("news", "scmp.com"),
    Check("common", "aliexpress.com"),
    Check("common", "stackoverflow.com"),
]

# top-1m entries that are skipped when building the "top" checks.
defunct = [
    "panda.tv",  # imochen.github.io
]


def _top1m_gen():
    """Return a generator of ``Check("top", ...)`` for the top-1m entries.

    Each item from ``retrieve_top1m_entries()`` is unpacked as a pair;
    only the second element is used, and entries listed in ``defunct``
    are skipped.
    """
    # fetch the source iterable up front, at call time, just as a generator
    # expression evaluates its outermost iterable when it is created.
    pairs = retrieve_top1m_entries()
    return (
        Check("top", domain)
        for _rank, domain in pairs
        if domain not in defunct
    )


# each head() call is handed its own fresh generator.
# NOTE(review): head(n, it) presumably takes the first n items -- confirm in .util
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())