add failures field to Check and sort by them

This commit removes the comments on individual checks;
these comments should be restored at a later point.
This commit is contained in:
Connor Olding 2020-09-04 15:39:08 +02:00
parent 35f2ce9206
commit f6e3a28aff
2 changed files with 65 additions and 96 deletions

View File

@@ -2,109 +2,78 @@ from .top1m import retrieve_top1m_entries
from .util import concat_nonsense, rot13, head from .util import concat_nonsense, rot13, head
from .structs import Check from .structs import Check
def order_by_failures(checks):
    """Return *checks* sorted by their ``failures`` count, highest first.

    Uses ``reverse=True`` instead of negating the key, which is the
    idiomatic form and also works if ``failures`` is any orderable type
    rather than only a number.  Python's sort is stable, so checks with
    equal failure counts keep their original relative order.
    """
    return sorted(checks, key=lambda check: check.failures, reverse=True)
first = [ first = [
# checking this first this avoids issues with censorship in China. see: # checking this first this avoids issues with censorship in China. see:
# https://www.bortzmeyer.org/sichuan-pepper.html # https://www.bortzmeyer.org/sichuan-pepper.html
Check("common", "baidu.com"), Check("common", "baidu.com", 491),
] ]
likely = [ likely = order_by_failures([
# these checks are, in practice, the most likely # these checks are, in practice, the most likely
# to weed out unwanted DNS servers. # to weed out unwanted DNS servers.
Check("news", "huanqiu.com"), Check("adtrack", "ad.doubleclick.net", 81),
Check("adware", rot13("nqf789.pbz")), Check("adtrack", "google-analytics.com", 75),
Check("adtrack", "media.fastclick.net", 116),
Check("adware", rot13("nqf789.pbz"), 168),
Check("bad", concat_nonsense("com"), 153),
Check("badsub", concat_nonsense("google.com"), 63),
Check("common", "archive.org", 98),
Check("common", "duckduckgo.com", 78),
Check("common", "en.wikipedia.org", 75),
Check("common", "facebook.com", 94),
Check("common", "google.com", 69),
# Check("common", "naver.com", 57),
Check("common", "paypal.com", 74),
Check("common", "wikileaks.com", 86),
Check("common", "xn--b1aew.xn--p1ai", 85),
Check("gambling", "bet365.com", 157),
Check("gambling", "betonline.ag", 168),
Check("gambling", "unibet.com", 137),
Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com", 98),
Check("infosec", "scan.shadowserver.org", 73),
Check("news", "huanqiu.com", 435),
Check("news", "telegram.com", 71),
Check("parking", "scmp.org", 132),
Check("piracy", "thehiddenbay.org", 77),
Check("porn", "pornhub.com", 151),
Check("shock", rot13("tbng.pk"), 209),
Check("uncommon", "cybre.space", 88),
Check("uncommon", "react.uni-saarland.de", 74),
Check("usercontent", "4chan.org", 116),
# Check("weird", "archive.is", 0),
Check("weirdsub", concat_nonsense("javarevisited.blogspot.com"), 126),
])
# some servers block shock sites, which isn't a terrible idea, unlikely = order_by_failures([
# but it's inaccurate. Check("common", "aliexpress.com", 2),
Check("shock", rot13("tbng.pk")), # actually parked at this time Check("common", "ebay.com", 4),
Check("common", "qq.com", 15),
# some servers block piracy and porn sites for being taboo, or whatever. Check("common", "stackoverflow.com", 1),
Check("porn", "pornhub.com"), Check("common", "tmall.com", 8),
Check("news", "nytimes.com", 6),
Check("adtrack", "media.fastclick.net"), Check("news", "scmp.com", 3),
Check("piracy", "thepiratebay.org", 24),
# dns.watch fails here: domain parking is evil, but servers must abide. Check("porn", "chaturbate.com", 18),
Check("parking", "scmp.org"), Check("porn", "xvideos.com", 23),
Check("usercontent", "facebook.com", 12),
# some servers block sites driven by their chaotic user-created content. Check("usercontent", "flickr.com", 5),
Check("usercontent", "4chan.org"), Check("usercontent", "github.com", 19),
Check("usercontent", "imgur.com", 22),
# some servers like to redirect nonexistent domains. see: Check("usercontent", "instagram.com", 7),
# https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/ Check("usercontent", "reddit.com", 13),
Check("bad", concat_nonsense("com")), Check("usercontent", "tumblr.com", 10),
Check("usercontent", "twitter.com", 21),
# blogspot handles these strangely; DNS servers likewise Check("usercontent", "weibo.com", 20),
Check("weirdsub", concat_nonsense("javarevisited.blogspot.com")), Check("usercontent", "wordpress.com", 9),
Check("video", "bilibili.com", 17),
# NOTE: disabled for being wildly inconsistent: Check("video", "netflix.com", 14),
# Cloudflare fails here. see: Check("video", "twitch.tv", 16),
# https://jarv.is/notes/cloudflare-dns-archive-is-blocked/ Check("video", "youtube.com", 11),
# Check("weird", "archive.is"), ])
Check("common", "archive.org"),
# this is one of the WannaCry sinkholes, it's kinda important.
Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),
# try out internationalized domains.
Check("common", "xn--b1aew.xn--p1ai"),
Check("common", "wikileaks.com"),
# i suppose this doubles as a check for the new TLDs.
Check("uncommon", "cybre.space"),
# some servers block piracy and porn sites for being taboo, or whatever
Check("piracy", "thehiddenbay.org"),
# some servers block this. see:
# https://scan.shadowserver.org/dns/
Check("infosec", "scan.shadowserver.org"),
# a few servers block this for some reason.
Check("common", "duckduckgo.com"),
# DNS poisoning may yield an unwanted result here.
Check("badsub", concat_nonsense("google.com")),
Check("common", "en.wikipedia.org"),
Check("adtrack", "google-analytics.com"),
Check("adtrack", "ad.doubleclick.net"),
# baidu goes here...?
Check("common", "naver.com"),
# surely a fully-functional server would resolve
# the most popular domain in existence, right?
Check("common", "google.com"),
]
unlikely = [
Check("piracy", "thepiratebay.org"),
Check("porn", "xvideos.com"),
Check("usercontent", "imgur.com"),
Check("usercontent", "twitter.com"),
Check("usercontent", "weibo.com"),
Check("usercontent", "github.com"),
Check("porn", "chaturbate.com"),
Check("video", "bilibili.com"),
Check("video", "twitch.tv"),
Check("common", "qq.com"),
Check("video", "netflix.com"),
Check("usercontent", "reddit.com"),
Check("usercontent", "facebook.com"),
Check("video", "youtube.com"),
Check("usercontent", "tumblr.com"),
Check("usercontent", "wordpress.com"),
Check("common", "tmall.com"),
Check("usercontent", "instagram.com"),
Check("news", "nytimes.com"),
Check("usercontent", "flickr.com"),
Check("common", "ebay.com"),
Check("news", "scmp.com"),
Check("common", "aliexpress.com"),
Check("common", "stackoverflow.com"),
]
defunct = [ defunct = [
"panda.tv", # imochen.github.io "panda.tv", # imochen.github.io
@@ -112,7 +81,7 @@ defunct = [
def _top1m_gen(): def _top1m_gen():
return (Check("top", entry) return (Check("top", entry, 0)
for i, entry in retrieve_top1m_entries() for i, entry in retrieve_top1m_entries()
if entry not in defunct) if entry not in defunct)

View File

@@ -35,4 +35,4 @@ class Entry:
execution: object execution: object
Check = namedtuple("Check", ("kind", "domain")) Check = namedtuple("Check", ("kind", "domain", "failures"))