from .top1m import retrieve_top1m_entries
from .util import concat_nonsense, rot13, head
from .structs import Check

# Domain check tables used by this package.
#
# NOTE(review): Check is called here as Check(kind, domain, failures); the
# third field is presumed to be the `failures` attribute read below --
# confirm against structs.Check.


def order_by_failures(checks):
    """Return a new list of *checks* sorted by ``failures``, highest first.

    Checks with equal ``failures`` keep their relative input order.
    """
    return sorted(checks, key=lambda item: item.failures, reverse=True)


def partition_checks(checks, n):
    """Split *checks* into two lists around the threshold *n*.

    Returns a ``(more, less)`` tuple: ``more`` holds every check whose
    ``failures`` is at least *n*, ``less`` holds the rest.  Input order is
    preserved within each list, and *checks* is traversed only once.
    """
    at_or_above, below = [], []
    for item in checks:
        bucket = at_or_above if item.failures >= n else below
        bucket.append(item)
    return at_or_above, below


first = [
    # checking this first avoids issues with censorship in China. see:
    # https://www.bortzmeyer.org/sichuan-pepper.html
    Check("common", "baidu.com", 1151),
]

_sinkhole = "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"
_weirdsub = concat_nonsense("javarevisited.blogspot.com")

# The literal below is grouped by kind for readability; order_by_failures
# re-sorts it so the entries with the most failures come first.
checks = order_by_failures([
    Check("adtrack", "ad.doubleclick.net", 99),
    Check("adtrack", "google-analytics.com", 89),
    Check("adtrack", "media.fastclick.net", 161),
    Check("adware", rot13("nqf789.pbz"), 194),
    Check("bad", concat_nonsense("com"), 170),
    Check("badsub", concat_nonsense("google.com"), 91),
    Check("common", "aliexpress.com", 114),
    Check("common", "archive.org", 114),
    Check("common", "duckduckgo.com", 107),
    Check("common", "ebay.com", 69),
    Check("common", "en.wikipedia.org", 102),
    Check("common", "example.com", 96),
    Check("common", "google.com", 101),
    Check("common", "naver.com", 90),
    Check("common", "paypal.com", 89),
    Check("common", "qq.com", 110),
    Check("common", "stackoverflow.com", 86),
    Check("common", "tmall.com", 115),
    Check("common", "wikileaks.com", 99),
    Check("common", "xn--b1aew.xn--p1ai", 106),
    Check("gambling", "bet365.com", 172),
    Check("gambling", "betonline.ag", 182),
    Check("gambling", "unibet.com", 160),
    Check("ignore", "archive.is", 0),
    Check("infosec", "scan.shadowserver.org", 107),
    Check("infosec", _sinkhole, 129),
    Check("news", "huanqiu.com", 393),
    Check("news", "nytimes.com", 101),
    Check("news", "scmp.com", 103),
    Check("news", "telegram.com", 80),
    Check("parking", "scmp.org", 158),
    Check("piracy", "thehiddenbay.org", 96),
    Check("piracy", "thepiratebay.org", 218),
    Check("porn", "chaturbate.com", 182),
    Check("porn", "pornhub.com", 165),
    Check("porn", "xvideos.com", 209),
    Check("shock", rot13("ebggra.pbz"), 94),
    Check("uncommon", "cybre.space", 116),
    Check("uncommon", "react.uni-saarland.de", 78),
    Check("usercontent", "4chan.org", 153),
    Check("usercontent", "facebook.com", 111),
    Check("usercontent", "flickr.com", 140),
    Check("usercontent", "github.com", 95),
    Check("usercontent", "imgur.com", 116),
    Check("usercontent", "instagram.com", 93),
    Check("usercontent", "reddit.com", 113),
    Check("usercontent", "tumblr.com", 118),
    Check("usercontent", "twitter.com", 123),
    Check("usercontent", "weibo.com", 115),
    Check("usercontent", "wordpress.com", 108),
    Check("video", "bilibili.com", 122),
    Check("video", "netflix.com", 129),
    Check("video", "twitch.tv", 97),
    Check("video", "youtube.com", 136),
    Check("weirdsub", _weirdsub, 152),
])

# Checks with at least 99 failures land in `likely`, the rest in `unlikely`.
likely, unlikely = partition_checks(checks, 99)

# Domains filtered out of the top-1m derived lists below.
defunct = [
    "panda.tv",  # imochen.github.io
]


def _top1m_gen():
    """Return a generator of ``Check("top", entry, 0)`` for top-1m entries.

    Entries listed in ``defunct`` are skipped.  The first element of each
    pair yielded by ``retrieve_top1m_entries()`` is not used here.
    """
    wrapped = (
        Check("top", entry, 0)
        for _unused, entry in retrieve_top1m_entries()
        if entry not in defunct
    )
    return wrapped


# Each list draws from its own fresh generator.
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())