From f6e3a28aff0c5740999586f2ee3a8f7cc60616f7 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Fri, 4 Sep 2020 15:39:08 +0200
Subject: [PATCH] add failures field to Check and sort by them

this commit removes the comments on individual checks.
these comments should be restored at a later point.
---
 respodns/checks.py  | 159 ++++++++++++++++++--------------------------
 respodns/structs.py |   2 +-
 2 files changed, 65 insertions(+), 96 deletions(-)

diff --git a/respodns/checks.py b/respodns/checks.py
index 0b82911..889d20c 100644
--- a/respodns/checks.py
+++ b/respodns/checks.py
@@ -2,109 +2,78 @@ from .top1m import retrieve_top1m_entries
 from .util import concat_nonsense, rot13, head
 from .structs import Check
 
+def order_by_failures(checks): # descending
+    return sorted(checks, key=lambda check: -check.failures)
+
 first = [
     # checking this first avoids issues with censorship in China. see:
     # https://www.bortzmeyer.org/sichuan-pepper.html
-    Check("common", "baidu.com"),
+    Check("common", "baidu.com", 491),
 ]
 
-likely = [
+likely = order_by_failures([
     # these checks are, in practice, the most likely
     # to weed out unwanted DNS servers.
-    Check("news", "huanqiu.com"),
-    Check("adware", rot13("nqf789.pbz")),
+    Check("adtrack", "ad.doubleclick.net", 81),
+    Check("adtrack", "google-analytics.com", 75),
+    Check("adtrack", "media.fastclick.net", 116),
+    Check("adware", rot13("nqf789.pbz"), 168),
+    Check("bad", concat_nonsense("com"), 153),
+    Check("badsub", concat_nonsense("google.com"), 63),
+    Check("common", "archive.org", 98),
+    Check("common", "duckduckgo.com", 78),
+    Check("common", "en.wikipedia.org", 75),
+    Check("common", "facebook.com", 94),
+    Check("common", "google.com", 69),
+    # Check("common", "naver.com", 57),
+    Check("common", "paypal.com", 74),
+    Check("common", "wikileaks.com", 86),
+    Check("common", "xn--b1aew.xn--p1ai", 85),
+    Check("gambling", "bet365.com", 157),
+    Check("gambling", "betonline.ag", 168),
+    Check("gambling", "unibet.com", 137),
+    Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com", 98),
+    Check("infosec", "scan.shadowserver.org", 73),
+    Check("news", "huanqiu.com", 435),
+    Check("news", "telegram.com", 71),
+    Check("parking", "scmp.org", 132),
+    Check("piracy", "thehiddenbay.org", 77),
+    Check("porn", "pornhub.com", 151),
+    Check("shock", rot13("tbng.pk"), 209),
+    Check("uncommon", "cybre.space", 88),
+    Check("uncommon", "react.uni-saarland.de", 74),
+    Check("usercontent", "4chan.org", 116),
+    # Check("weird", "archive.is", 0),
+    Check("weirdsub", concat_nonsense("javarevisited.blogspot.com"), 126),
+])
 
-    # some servers block shock sites, which isn't a terrible idea,
-    # but it's inaccurate.
-    Check("shock", rot13("tbng.pk")), # actually parked at this time
-
-    # some servers block piracy and porn sites for being taboo, or whatever.
-    Check("porn", "pornhub.com"),
-
-    Check("adtrack", "media.fastclick.net"),
-
-    # dns.watch fails here: domain parking is evil, but servers must abide.
-    Check("parking", "scmp.org"),
-
-    # some servers block sites driven by their chaotic user-created content.
-    Check("usercontent", "4chan.org"),
-
-    # some servers like to redirect nonexistent domains. see:
-    # https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/
-    Check("bad", concat_nonsense("com")),
-
-    # blogspot handles these strangely; DNS servers likewise
-    Check("weirdsub", concat_nonsense("javarevisited.blogspot.com")),
-
-    # NOTE: disabled for being wildly inconsistent:
-    # Cloudflare fails here. see:
-    # https://jarv.is/notes/cloudflare-dns-archive-is-blocked/
-    # Check("weird", "archive.is"),
-
-    Check("common", "archive.org"),
-
-    # this is one of the WannaCry sinkholes, it's kinda important.
-    Check("infosec", "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"),
-
-    # try out internationalized domains.
-    Check("common", "xn--b1aew.xn--p1ai"),
-
-    Check("common", "wikileaks.com"),
-
-    # i suppose this doubles as a check for the new TLDs.
-    Check("uncommon", "cybre.space"),
-
-    # some servers block piracy and porn sites for being taboo, or whatever
-    Check("piracy", "thehiddenbay.org"),
-
-    # some servers block this. see:
-    # https://scan.shadowserver.org/dns/
-    Check("infosec", "scan.shadowserver.org"),
-
-    # a few servers block this for some reason.
-    Check("common", "duckduckgo.com"),
-
-    # DNS poisoning may yield an unwanted result here.
-    Check("badsub", concat_nonsense("google.com")),
-
-    Check("common", "en.wikipedia.org"),
-    Check("adtrack", "google-analytics.com"),
-    Check("adtrack", "ad.doubleclick.net"),
-    # baidu goes here...?
-    Check("common", "naver.com"),
-
-    # surely a fully-functional server would resolve
-    # the most popular domain in existence, right?
-    Check("common", "google.com"),
-]
-
-unlikely = [
-    Check("piracy", "thepiratebay.org"),
-    Check("porn", "xvideos.com"),
-    Check("usercontent", "imgur.com"),
-    Check("usercontent", "twitter.com"),
-    Check("usercontent", "weibo.com"),
-    Check("usercontent", "github.com"),
-    Check("porn", "chaturbate.com"),
-    Check("video", "bilibili.com"),
-    Check("video", "twitch.tv"),
-    Check("common", "qq.com"),
-    Check("video", "netflix.com"),
-    Check("usercontent", "reddit.com"),
-    Check("usercontent", "facebook.com"),
-    Check("video", "youtube.com"),
-    Check("usercontent", "tumblr.com"),
-    Check("usercontent", "wordpress.com"),
-    Check("common", "tmall.com"),
-    Check("usercontent", "instagram.com"),
-    Check("news", "nytimes.com"),
-    Check("usercontent", "flickr.com"),
-    Check("common", "ebay.com"),
-    Check("news", "scmp.com"),
-    Check("common", "aliexpress.com"),
-    Check("common", "stackoverflow.com"),
-]
+unlikely = order_by_failures([
+    Check("common", "aliexpress.com", 2),
+    Check("common", "ebay.com", 4),
+    Check("common", "qq.com", 15),
+    Check("common", "stackoverflow.com", 1),
+    Check("common", "tmall.com", 8),
+    Check("news", "nytimes.com", 6),
+    Check("news", "scmp.com", 3),
+    Check("piracy", "thepiratebay.org", 24),
+    Check("porn", "chaturbate.com", 18),
+    Check("porn", "xvideos.com", 23),
+    Check("usercontent", "facebook.com", 12),
+    Check("usercontent", "flickr.com", 5),
+    Check("usercontent", "github.com", 19),
+    Check("usercontent", "imgur.com", 22),
+    Check("usercontent", "instagram.com", 7),
+    Check("usercontent", "reddit.com", 13),
+    Check("usercontent", "tumblr.com", 10),
+    Check("usercontent", "twitter.com", 21),
+    Check("usercontent", "weibo.com", 20),
+    Check("usercontent", "wordpress.com", 9),
+    Check("video", "bilibili.com", 17),
+    Check("video", "netflix.com", 14),
+    Check("video", "twitch.tv", 16),
+    Check("video", "youtube.com", 11),
+])
 
 defunct = [
     "panda.tv", # imochen.github.io
@@ -112,7 +81,7 @@ defunct = [
 
 def _top1m_gen():
-    return (Check("top", entry)
+    return (Check("top", entry, 0)
             for i, entry in retrieve_top1m_entries()
             if entry not in defunct)
 
diff --git a/respodns/structs.py b/respodns/structs.py
index cd541cd..bf8f937 100644
--- a/respodns/structs.py
+++ b/respodns/structs.py
@@ -35,4 +35,4 @@ class Entry:
     execution: object
 
 
-Check = namedtuple("Check", ("kind", "domain"))
+Check = namedtuple("Check", ("kind", "domain", "failures"))
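
A note for review: below is a minimal, self-contained sketch of how the new
three-field Check and order_by_failures() fit together. The Check values are
copied from the patch above; the sample list and the print loop are
illustrative scaffolding, not part of the change.

    from collections import namedtuple

    # mirrors respodns/structs.py after this patch
    Check = namedtuple("Check", ("kind", "domain", "failures"))

    # mirrors respodns/checks.py after this patch: highest failures first
    def order_by_failures(checks):
        return sorted(checks, key=lambda check: -check.failures)

    sample = [
        Check("common", "google.com", 69),
        Check("common", "baidu.com", 491),
        Check("news", "huanqiu.com", 435),
    ]

    for check in order_by_failures(sample):
        print(check.failures, check.kind, check.domain)
    # 491 common baidu.com
    # 435 news huanqiu.com
    # 69 common google.com

Since failures is a plain int, the negated key behaves the same as
sorted(checks, key=lambda c: c.failures, reverse=True); the lambda form above
matches what the patch adds.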
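
The patch hard-codes each failure count but doesn't show how the numbers were
produced. One plausible way to regenerate them from recorded probe results is
sketched below; tally_failures and the (domain, ok) result format are
assumptions for illustration, not functions in this codebase.

    from collections import Counter, namedtuple

    Check = namedtuple("Check", ("kind", "domain", "failures"))

    def tally_failures(results, checks):
        # results: iterable of (domain, ok) pairs, where ok is False
        # whenever a DNS server mishandled that domain (assumed format).
        failed = Counter(domain for domain, ok in results if not ok)
        # namedtuple._replace returns a copy with the given field updated.
        return [check._replace(failures=failed[check.domain])
                for check in checks]

Counts produced this way could then be pasted back into checks.py, or the
checks could be rebuilt at runtime and passed through order_by_failures()
directly.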