2020-08-29 01:16:06 -07:00
|
|
|
from .top1m import retrieve_top1m_entries
|
2020-08-29 05:54:07 -07:00
|
|
|
from .util import concat_nonsense, rot13, head
|
|
|
|
from .structs import Check
|
2020-08-29 01:16:06 -07:00
|
|
|
|
2020-09-04 06:49:21 -07:00
|
|
|
|
2021-08-13 02:32:21 -07:00
|
|
|
def order_by_failures(checks):
    """Return a new list of ``checks`` sorted by ``failures``, highest first.

    The input iterable is not modified. The sort is stable, so checks with
    the same failure count keep their relative input order.
    """
    def descending_key(item):
        # Negating the count makes an ascending sort come out descending.
        return -item.failures

    ordered = list(checks)
    ordered.sort(key=descending_key)
    return ordered
|
|
|
|
|
2020-09-04 06:49:21 -07:00
|
|
|
|
2020-09-04 07:06:37 -07:00
|
|
|
def partition_checks(checks, n):
    """Split ``checks`` in a single pass around the failure threshold ``n``.

    Returns a ``(more, less)`` tuple of lists: the first list holds every
    check whose ``failures`` is at least ``n``, the second holds the rest.
    Input order is preserved within each list.
    """
    at_or_above, below = [], []
    for item in checks:
        # Pick the destination list first, then append exactly once.
        bucket = at_or_above if item.failures >= n else below
        bucket.append(item)
    return at_or_above, below
|
|
|
|
|
|
|
|
|
2020-08-29 01:28:26 -07:00
|
|
|
# Checking this domain first avoids issues with censorship in China. See:
# https://www.bortzmeyer.org/sichuan-pepper.html
first = [Check("common", "baidu.com", 1151)]
|
|
|
|
|
2020-09-04 06:49:21 -07:00
|
|
|
# Hostnames referenced by entries of the `checks` table in this module.
# `_weirdsub` is whatever concat_nonsense() produces for this blog hostname;
# `_sinkhole` is a fixed literal (presumably a known sinkholed domain, going
# by its name -- confirm).
_weirdsub = concat_nonsense("javarevisited.blogspot.com")
_sinkhole = "iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com"
|
|
|
|
|
2020-09-04 07:06:37 -07:00
|
|
|
# Table of domain checks. The literal is written grouped by its first
# argument (alphabetically); order_by_failures() then re-sorts the whole
# table so the entries with the largest last argument come first.
# NOTE(review): the three positional fields look like (kind, domain,
# failures) -- confirm against the Check definition in .structs.
checks = order_by_failures([
    Check("adtrack", "ad.doubleclick.net", 99),
    Check("adtrack", "google-analytics.com", 89),
    Check("adtrack", "media.fastclick.net", 161),
    Check("adware", rot13("nqf789.pbz"), 194),
    Check("bad", concat_nonsense("com"), 170),
    Check("badsub", concat_nonsense("google.com"), 91),
    Check("common", "aliexpress.com", 114),
    Check("common", "archive.org", 114),
    Check("common", "duckduckgo.com", 107),
    Check("common", "ebay.com", 69),
    Check("common", "en.wikipedia.org", 102),
    Check("common", "example.com", 96),
    Check("common", "google.com", 101),
    Check("common", "naver.com", 90),
    Check("common", "paypal.com", 89),
    Check("common", "qq.com", 110),
    Check("common", "stackoverflow.com", 86),
    Check("common", "tmall.com", 115),
    Check("common", "wikileaks.com", 99),
    Check("common", "xn--b1aew.xn--p1ai", 106),
    Check("gambling", "bet365.com", 172),
    Check("gambling", "betonline.ag", 182),
    Check("gambling", "unibet.com", 160),
    Check("ignore", "archive.is", 0),
    Check("infosec", "scan.shadowserver.org", 107),
    Check("infosec", _sinkhole, 129),
    Check("news", "huanqiu.com", 393),
    Check("news", "nytimes.com", 101),
    Check("news", "scmp.com", 103),
    Check("news", "telegram.com", 80),
    Check("parking", "scmp.org", 158),
    Check("piracy", "thehiddenbay.org", 96),
    Check("piracy", "thepiratebay.org", 218),
    Check("porn", "chaturbate.com", 182),
    Check("porn", "pornhub.com", 165),
    Check("porn", "xvideos.com", 209),
    Check("shock", rot13("ebggra.pbz"), 94),
    Check("uncommon", "cybre.space", 116),
    Check("uncommon", "react.uni-saarland.de", 78),
    Check("usercontent", "4chan.org", 153),
    Check("usercontent", "facebook.com", 111),
    Check("usercontent", "flickr.com", 140),
    Check("usercontent", "github.com", 95),
    Check("usercontent", "imgur.com", 116),
    Check("usercontent", "instagram.com", 93),
    Check("usercontent", "reddit.com", 113),
    Check("usercontent", "tumblr.com", 118),
    Check("usercontent", "twitter.com", 123),
    Check("usercontent", "weibo.com", 115),
    Check("usercontent", "wordpress.com", 108),
    Check("video", "bilibili.com", 122),
    Check("video", "netflix.com", 129),
    Check("video", "twitch.tv", 97),
    Check("video", "youtube.com", 136),
    Check("weirdsub", _weirdsub, 152),
])
|
2020-08-29 01:16:06 -07:00
|
|
|
|
2021-08-11 18:57:51 -07:00
|
|
|
# Split the table at a threshold of 99 failures: `likely` receives the checks
# with failures >= 99, `unlikely` receives the remainder.
likely, unlikely = partition_checks(checks, n=99)
|
2020-09-04 07:06:37 -07:00
|
|
|
|
2020-08-29 01:16:06 -07:00
|
|
|
# Entries listed here are skipped when _top1m_gen() builds its checks.
defunct = ["panda.tv"]  # imochen.github.io
|
|
|
|
|
2020-08-29 06:34:46 -07:00
|
|
|
|
2020-08-29 01:16:06 -07:00
|
|
|
def _top1m_gen():
    """Return a generator of ``Check("top", entry, 0)`` objects.

    One check is produced per pair yielded by ``retrieve_top1m_entries()``;
    pairs whose second element appears in ``defunct`` are skipped. The first
    element of each pair is unpacked but not used.
    """
    # Fetched eagerly at call time; only the per-entry work is lazy.
    source = retrieve_top1m_entries()
    return (
        Check("top", domain, 0)
        for _, domain in source
        if domain not in defunct
    )
|
|
|
|
|
2020-08-29 06:34:46 -07:00
|
|
|
|
2020-08-29 01:16:06 -07:00
|
|
|
# Each name gets its own fresh generator from _top1m_gen(), so the two
# head() calls do not share iteration state.
# NOTE(review): head(n, iterable) presumably yields/returns the first n
# items -- confirm against .util.
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())
|