2020-08-29 01:16:06 -07:00
from . top1m import retrieve_top1m_entries
2020-08-29 05:54:07 -07:00
from . util import concat_nonsense , rot13 , head
from . structs import Check
2020-08-29 01:16:06 -07:00
2020-08-29 01:28:26 -07:00
# Checks grouped under the name `first`.
# NOTE(review): presumably these run ahead of the other lists -- confirm
# against the code that consumes this module.
first = [
    Check(" common ", " baidu.com "),  # this avoids issues with Chinese censorship: https://www.bortzmeyer.org/sichuan-pepper.html
]
2020-08-29 01:28:26 -07:00
# Checks grouped under the name `new`; each entry pairs a category label
# with a domain name.
new = [
    # via dnsvalidator
    Check(" adtrack ", " bet365.com "),
    Check(" common ", " facebook.com "),
    Check(" common ", " google.com "),
    Check(" common ", " paypal.com "),
    Check(" common ", " wikileaks.com "),
    Check(" news ", " telegram.com "),
]
2020-08-29 01:28:26 -07:00
# Checks grouped under the name `likely`; each entry pairs a category label
# with a domain name (some domains are produced by rot13() or
# concat_nonsense() from .util).
# NOTE(review): the string literals here carry leading/trailing spaces,
# unlike the commented-out archive.is entry below -- confirm the padding
# is intended.
likely = [
    # these checks are, in practice, the most likely to weed out unwanted DNS servers.
    Check(" news ", " huanqiu.com "),
    Check(" adware ", rot13(" nqf789.pbz ")),
    Check(" shock ", rot13(" tbng.pk ")),  # some servers block shock sites, which isn't a terrible idea, but it's inaccurate
    Check(" porn ", " pornhub.com "),  # some servers block piracy and porn sites for being taboo, or whatever
    Check(" adtrack ", " media.fastclick.net "),
    Check(" parking ", " scmp.org "),  # dns.watch fails here: domain parking is evil, but servers must abide
    Check(" usercontent ", " 4chan.org "),  # some servers block sites driven by their chaotic user-created content
    Check(" bad ", concat_nonsense(" com ")),  # some servers like to redirect nonexistent domains: https://web.archive.org/web/20140302064622/http://james.bertelson.me/blog/2014/01/level-3-are-now-hijacking-failed-dns-requests-for-ad-revenue-on-4-2-2-x/
    Check(" weirdsub ", concat_nonsense(" javarevisited.blogspot.com ")),  # blogspot handles these strangely; DNS servers likewise
    # NOTE: disabled for being wildly inconsistent:
    # Check("weird", "archive.is"), # Cloudflare fails here: https://jarv.is/notes/cloudflare-dns-archive-is-blocked/
    Check(" common ", " archive.org "),
    Check(" infosec ", " iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com "),  # one of the WannaCry sinkholes, kinda important that it resolves
    Check(" common ", " xn--b1aew.xn--p1ai "),  # just to test internationalized domains
    Check(" common ", " wikileaks.com "),
    Check(" uncommon ", " cybre.space "),  # also doubles as a check for the new TLDs
    Check(" piracy ", " thehiddenbay.org "),  # some servers block piracy and porn sites for being taboo, or whatever
    Check(" infosec ", " scan.shadowserver.org "),  # some servers block this: https://scan.shadowserver.org/dns/
    Check(" common ", " duckduckgo.com "),  # a few servers block this for some reason?
    Check(" badsub ", concat_nonsense(" google.com ")),  # poisoning may yield an unwanted result here
    Check(" common ", " en.wikipedia.org "),
    Check(" adtrack ", " google-analytics.com "),
    Check(" adtrack ", " ad.doubleclick.net "),
    # baidu goes here...?
    Check(" common ", " naver.com "),
    Check(" common ", " google.com "),  # surely a fully-functional server would resolve the most popular domain in existence
]
2020-08-29 01:28:26 -07:00
# Checks grouped under the name `unlikely`; each entry pairs a category
# label with a domain name.
# NOTE(review): presumably the counterpart of `likely`, i.e. checks that
# rarely weed out a server -- confirm against the consumer of this module.
unlikely = [
    Check(" piracy ", " thepiratebay.org "),
    Check(" porn ", " xvideos.com "),
    Check(" usercontent ", " imgur.com "),
    Check(" usercontent ", " twitter.com "),
    Check(" usercontent ", " weibo.com "),
    Check(" usercontent ", " github.com "),
    Check(" porn ", " chaturbate.com "),
    Check(" video ", " bilibili.com "),
    Check(" video ", " twitch.tv "),
    Check(" common ", " qq.com "),
    Check(" video ", " netflix.com "),
    Check(" usercontent ", " reddit.com "),
    Check(" usercontent ", " facebook.com "),
    Check(" video ", " youtube.com "),
    Check(" usercontent ", " tumblr.com "),
    Check(" usercontent ", " wordpress.com "),
    Check(" common ", " tmall.com "),
    Check(" usercontent ", " instagram.com "),
    Check(" news ", " nytimes.com "),
    Check(" usercontent ", " flickr.com "),
    Check(" common ", " ebay.com "),
    Check(" news ", " scmp.com "),
    Check(" common ", " aliexpress.com "),
    Check(" common ", " stackoverflow.com "),
]
# Domains that _top1m_gen() leaves out when it builds checks from the
# top-1m entries.
defunct = [
    " panda.tv ",  # imochen.github.io
]
def _top1m_gen():
    """Return a generator of ``Check(" top ", entry)`` objects.

    One check is produced for every pair yielded by
    ``retrieve_top1m_entries()`` whose second element is not listed in
    ``defunct``; the first element of each pair is ignored.
    """
    # Fetched up front, exactly once per call; the filtering and the Check
    # construction happen lazily as the returned generator is consumed.
    entries = retrieve_top1m_entries()
    return (
        Check(" top ", domain)
        for _, domain in entries
        if domain not in defunct
    )
# Module-level selections taken from the top-1m checks via head() (from
# .util) with sizes 100 and 1000. Each one gets its own _top1m_gen() call,
# so the two do not share a generator.
top100 = head(100, _top1m_gen())
top1000 = head(1000, _top1m_gen())