From 3afeb5a3fd6eb338b4e413df6e38a15785d81fa4 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sat, 29 Aug 2020 10:16:06 +0200 Subject: [PATCH] init --- .gitignore | 3 + LICENSE | 13 ++ README.md | 1 + requirements.txt | 2 + respodns/__init__.py | 0 respodns/__main__.py | 208 +++++++++++++++++++++ respodns/checks.py | 114 ++++++++++++ respodns/db.py | 434 +++++++++++++++++++++++++++++++++++++++++++ respodns/ips.py | 67 +++++++ respodns/nonsense.py | 8 + respodns/pooler.py | 48 +++++ respodns/structs.py | 32 ++++ respodns/top1m.py | 52 ++++++ respodns/util.py | 71 +++++++ setup.py | 31 ++++ 15 files changed, 1084 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 respodns/__init__.py create mode 100644 respodns/__main__.py create mode 100644 respodns/checks.py create mode 100644 respodns/db.py create mode 100644 respodns/ips.py create mode 100644 respodns/nonsense.py create mode 100644 respodns/pooler.py create mode 100644 respodns/structs.py create mode 100644 respodns/top1m.py create mode 100644 respodns/util.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..308237e --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.csv +*.db +__pycache__/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8df617e --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2020, Connor Olding + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. 
def process_result(res, ip, check, opts: Options):
    """Classify one DNS lookup result and wrap it in an Entry.

    `res` is the list returned by getaddrs(): either address strings, or a
    single-element sentinel such as ["NXDOMAIN"], ["NoAnswer"],
    ["NoNameservers"] or ["Timeout"].  `ip` is the DNS server queried and
    `check` the (kind, domain) pair that was looked up.
    """
    # TODO: get more accurate times by inserting start-end into getaddrs.
    now = right_now()
    assert len(res) > 0
    reason = None

    if "Timeout" in res:
        reason = "timeout"

    elif check.kind.startswith("bad"):
        # "bad"/"badsub" domains are nonsense; an honest server must answer
        # NXDOMAIN — anything else means the server redirects failed lookups.
        reason = "okay" if "NXDOMAIN" in res else "redirect"

    elif any(is_bogon(r) for r in res):
        reason = "block"
    elif any(blocked in res for blocked in blocks):
        # known block/denial IPs (see ips.blocks)
        reason = "block"

    elif not any(len(r) > 0 and r[0].isdigit() for r in res):
        # nothing that even looks like an IPv4 address came back
        # (heuristic: addresses start with a digit, sentinels with a letter).
        # TODO: check for no alias on common.
        reason = "missing"

    else:
        # scan the address-like results for Great-Firewall signatures.
        for r in res:
            if len(r) == 0 or not r[0].isdigit():
                continue
            if detect_gfw(r, ip, check):
                reason = "gfw"
                break
        else:
            reason = "okay"

    assert reason is not None, (res, ip, check)

    # keep only the address-like strings; when there are none, the first
    # element is the exception sentinel reported by getaddrs().
    addrs = list(filter(lambda r: len(r) > 0 and r[0].isdigit(), res))
    exception = res[0] if len(addrs) == 0 else None

    return Entry(
        date=now,
        success=reason == "okay",
        server=ip,
        kind=check.kind,
        domain=check.domain,
        exception=exception,
        addrs=addrs,
        reason=reason,
        execution=opts.execution,
    )
async def main(db, filepath, checks, opts: Options):
    """Run every check against each DNS server IP listed in `filepath`.

    An empty filepath reads IPs from stdin.  `db` may be None when opts.dry
    is set.  Results stream to stdout as each server finishes.
    """
    def finisher(done, pending):
        # Invoked by the pooler with completed try_ip tasks.  A bare IP line
        # means the server passed every check; in dry runs a failing server
        # prints a tab-separated summary instead.
        for task in done:
            ip, first_failure = task.result()
            if first_failure is None:
                print(ip)
            elif opts.dry:
                ff = first_failure
                if ff.kind in ("shock", "adware"):
                    # don't echo these domains themselves
                    print(ip, ff.reason, ff.kind, sep="\t")
                else:
                    print(ip, ff.reason, ff.kind, ff.domain, sep="\t")

    pooler = make_simple_pooler(opts.ip_simul, finisher)

    # "" selects stdin so the tool composes in shell pipelines.
    f = stdin if filepath == "" else open(filepath, "r")
    for i, ip in enumerate(read_ips(f)):
        first = i == 0
        if opts.progress:
            print(f"#{i}: {ip}", file=stderr)
            stderr.flush()
        if not first:
            # stagger new servers to avoid a burst of simultaneous queries
            await sleep(opts.ip_wait)
        await pooler(try_ip(db, ip, checks, opts))
    if f != stdin:
        f.close()

    # drain: wait for the remaining in-flight servers to finish.
    await pooler()
+ parser.add_argument( + "path", metavar="file-path", + help="a path to a file containing IPv4 addresses which host DNS servers") + + parser.add_argument( + "--database", + help="specify database for logging") + + a = parser.parse_args(args) + + checks = [] + checks += first_checks + #checks += new_checks + checks += likely_checks + #checks += unlikely_checks + #checks += top100 + + opts = Options() + opts.dry = a.database is None + opts.early_stopping = opts.dry + + if a.database is not None: + if a.database.startswith("sqlite:"): + uri = a.database + else: + uri = "sqlite:///" + a.database + + def runwrap(db, debug=False): + if debug: + import logging + logging.basicConfig(level=logging.DEBUG) + run(main(db, a.path, checks, opts), debug=True) + else: + run(main(db, a.path, checks, opts)) + + if opts.dry: + runwrap(None) + else: + # log to a database. + db = RespoDB(uri, create=True) + with db: # TODO: .open and .close methods for manual invocation. + with db.execution as execution: # TODO: clean up this interface. + opts.execution = execution + runwrap(db) + +if __name__ == "__main__": + if len(argv) == 0: + print("You've met with a terrible fate.", file=stderr) + ret = 2 + else: + ret = ui(argv[0], argv[1:]) + if ret is not None: + exit(ret) diff --git a/respodns/checks.py b/respodns/checks.py new file mode 100644 index 0000000..5d3f68f --- /dev/null +++ b/respodns/checks.py @@ -0,0 +1,114 @@ +from collections import namedtuple +from .nonsense import nonsense_consistent +from .top1m import retrieve_top1m_entries + +rot13_mapping = {} +for a, b, c, d in zip("anAN05", "mzMZ49", "naNA50", "zmZM94"): + rot13_mapping.update(dict((chr(k), chr(v)) + for k, v in zip(range(ord(a), ord(b) + 1), + range(ord(c), ord(d) + 1)))) + +def rot13(s): + return "".join(rot13_mapping.get(c, c) for c in s) + +def concat_nonsense(domain): + return nonsense_consistent(domain) + "." 
def head(n, it):
    """Collect at most the first `n` items of iterator `it` into a list.

    Consumes up to n items from `it`; returns a shorter (possibly empty)
    list when the iterator runs out early or when n <= 0.
    """
    from itertools import islice
    # islice raises ValueError on negative counts; the original hand-rolled
    # loop treated them as "take nothing", so clamp to preserve that.
    return list(islice(it, max(n, 0)))
internationalized domains + Check("common", "wikileaks.com"), + Check("uncommon", "cybre.space"), # also doubles as a check for the new TLDs + Check("piracy", "thehiddenbay.org"), # some servers block piracy and porn sites for being taboo, or whatever + Check("infosec", "scan.shadowserver.org"), # some servers block this: https://scan.shadowserver.org/dns/ + Check("common", "duckduckgo.com"), # a few servers block this for some reason? + Check("badsub", concat_nonsense("google.com")), # poisoning may yield an unwanted result here + Check("common", "en.wikipedia.org"), + Check("adtrack", "google-analytics.com"), + Check("adtrack", "ad.doubleclick.net"), + # baidu goes here...? + Check("common", "naver.com"), + Check("common", "google.com"), # surely a fully-functional server would resolve the most popular domain in existence +] + +unlikely_checks = [ + Check("piracy", "thepiratebay.org"), + Check("porn", "xvideos.com"), + Check("usercontent","imgur.com"), + Check("usercontent","twitter.com"), + Check("usercontent","weibo.com"), + Check("usercontent","github.com"), + Check("porn", "chaturbate.com"), + Check("video", "bilibili.com"), + Check("video", "twitch.tv"), + Check("common", "qq.com"), + Check("video", "netflix.com"), + Check("usercontent","reddit.com"), + Check("usercontent","facebook.com"), + Check("video", "youtube.com"), + Check("usercontent","tumblr.com"), + Check("usercontent","wordpress.com"), + Check("common", "tmall.com"), + Check("usercontent","instagram.com"), + Check("news", "nytimes.com"), + Check("usercontent","flickr.com"), + Check("common", "ebay.com"), + Check("news", "scmp.com"), + Check("common", "aliexpress.com"), + Check("common", "stackoverflow.com"), +] + +defunct = [ + "panda.tv", # imochen.github.io +] + +def _top1m_gen(): + return (Check("top", entry) + for i, entry in retrieve_top1m_entries() + if entry not in defunct) + +top100 = head(100, _top1m_gen()) +top1000 = head(1000, _top1m_gen()) + +#__all__ = [ +# "first_checks", 
"new_checks", "likely_checks", "unlikely_checks", "top100", +# "defunct", +#] diff --git a/respodns/db.py b/respodns/db.py new file mode 100644 index 0000000..291e1fe --- /dev/null +++ b/respodns/db.py @@ -0,0 +1,434 @@ +import storm.locals as rain +import re + +ipv4_pattern = re.compile("(\d+)\.(\d+)\.(\d+)\.(\d+)", re.ASCII) + +def addr_to_int(ip): + match = ipv4_pattern.fullmatch(ip) + assert match is not None, row + segs = list(map(int, match.group(1, 2, 3, 4))) + assert all(0 <= seg <= 255 for seg in segs), match.group(0) + numeric = segs[0] << 24 | segs[1] << 16 | segs[2] << 8 | segs[3] + return numeric + +create_table_statements = dict( + # TODO: Duration REAL GENERATED ALWAYS AS etc.? + executions=""" +CREATE TABLE IF NOT EXISTS Executions ( + ExecutionId INTEGER PRIMARY KEY, + StartDate DATE NOT NULL, + FinishDate DATE, + Completed BOOLEAN DEFAULT 0 NOT NULL) + """, + + exceptions=""" +CREATE TABLE IF NOT EXISTS Exceptions ( + ExceptionId INTEGER PRIMARY KEY, + Name TEXT NOT NULL, + Fail BOOLEAN NOT NULL) + """, + + ips=""" +CREATE TABLE IF NOT EXISTS Ips ( + IpId INTEGER PRIMARY KEY, + AsStr TEXT GENERATED ALWAYS AS ( + Cast(AsInt >> 24 & 255 AS TEXT) || '.' || + Cast(AsInt >> 16 & 255 AS TEXT) || '.' || + Cast(AsInt >> 8 & 255 AS TEXT) || '.' 
|| + Cast(AsInt & 255 AS TEXT) + ) STORED NOT NULL, + AsInt INTEGER UNIQUE CHECK(AsInt >= 0 AND AsInt < 1 << 32) NOT NULL, + China BOOLEAN DEFAULT 0 NOT NULL, + BlockTarget BOOLEAN DEFAULT 0 NOT NULL, + Server BOOLEAN DEFAULT 0 NOT NULL, + RedirectTarget BOOLEAN DEFAULT 0 NOT NULL, + GfwTarget BOOLEAN DEFAULT 0 NOT NULL) + """, + + kinds=""" +CREATE TABLE IF NOT EXISTS Kinds ( + KindId INTEGER PRIMARY KEY, + Name TEXT UNIQUE NOT NULL, + ExpectExceptionId INTEGER, + FOREIGN KEY(ExpectExceptionId) REFERENCES Exceptions(ExceptionId)) + """, + + domains=""" +CREATE TABLE IF NOT EXISTS Domains ( + DomainId INTEGER PRIMARY KEY, + Name TEXT UNIQUE NOT NULL, + KindId INTEGER, + FOREIGN KEY(KindId) REFERENCES Kinds(KindId)) + """, + + # NOTE: that RecordId is *not* the rowid here + # since records can contain multiple IPs, + # and thereby span multiple rows. + # TODO: indexing stuff, cascade deletion stuff. + records=""" +CREATE TABLE IF NOT EXISTS Records ( + RecordId INTEGER NOT NULL, + IpId INTEGER, + FOREIGN KEY(IpId) REFERENCES Ips(IpId)) + """, + + messages=""" +CREATE TABLE IF NOT EXISTS Messages ( + MessageId INTEGER PRIMARY KEY, + ExecutionId INTEGER, + ServerId INTEGER NOT NULL, + DomainId INTEGER NOT NULL, + RecordId INTEGER, + ExceptionId INTEGER, + FOREIGN KEY(ServerId) REFERENCES Ips(IpId), + FOREIGN KEY(ExecutionId) REFERENCES Executions(ExecutionId), + FOREIGN KEY(DomainId) REFERENCES Domains(DomainId), + FOREIGN KEY(ExceptionId) REFERENCES Exceptions(ExceptionId)) + """, + # this fails because RecordId is not UNIQUE: + # FOREIGN KEY(RecordId) REFERENCES Records(RecordId) +) + +create_view_statements = [ + """ +CREATE VIEW Results AS +SELECT + Messages.ExecutionId, + ServerIps.AsStr as Server, + Kinds.Name as Kind, + Domains.Name as Name, + RecordIps.AsStr as Address, + Exceptions.Name as Exception +FROM Messages +LEFT JOIN Domains ON Messages.DomainId = Domains.DomainId +LEFT JOIN Kinds ON Domains.KindId = Kinds.KindId +LEFT JOIN Ips AS ServerIps ON 
class AttrCheck:
    """Mixin that rejects assignment to attributes the class doesn't declare.

    Guards against typos on model objects: only names already visible on
    the class, or underscore-prefixed ones, may be set on instances.
    """

    def __setattr__(self, name, value):
        # dir() is used instead of hasattr(): it also reports descriptors
        # declared on the class that may not resolve on this instance yet.
        permitted = name.startswith("_") or name in dir(self)
        if not permitted:
            raise AttributeError(name)
        super().__setattr__(name, value)
"TExecution.execution_id") + server = rain.Reference(server_id, "TAddress.address_id") + domain = rain.Reference(domain_id, "TDomain.domain_id") + #record = rain.Reference(record_id, "TRecord.record_id") + exception = rain.Reference(exception_id, "TException.exception_id") + +def apply_properties(obj, d): + from storm.properties import PropertyColumn + for k, v in d.items(): + ref = getattr(obj.__class__, k) + assert ref is not None, (type(obj), k) + assert isinstance(ref, PropertyColumn) or isinstance(ref, rain.Reference), \ + (type(obj), k) + setattr(obj, k, v) + return obj + +class RespoDB: + def __init__(self, uri, setup=False, create=False): + self.uri = uri + db_exists = self._db_exists(self.uri) + self.db = rain.create_database(self.uri) + self._conn = None + + if setup or (create and not db_exists): + with self: + self.setup_executions() + self.setup_exceptions() + self.setup_ips() + self.setup_kinds() + self.setup_domains() + self.setup_records() + self.setup_messages() + + for q in create_view_statements: + self._conn.execute(q, noresult=True) + assert setup or create or db_exists, "database was never setup" + + self.execution = Execution(self) + + @staticmethod + def _db_exists(uri): + from os.path import exists + _, _, fp = uri.partition(":") + if fp.startswith("//"): + _, _, fp = fp[2:].partition("/") + return fp and exists(fp) + + def __enter__(self): + self._conn = rain.Store(self.db) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.commit() + self._conn.close() + self._conn = None + + def find_one(self, cls_spec, *args, **kwargs): + assert self._conn is not None + return self._conn.find(cls_spec, *args, **kwargs).one() + + def flush(self): + assert self._conn is not None + self._conn.flush() + + def commit(self): + assert self._conn is not None + self._conn.commit() + + def new_exception(self, **kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TException(), kwargs)) + + def new_kind(self, 
**kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TKind(), kwargs)) + + def new_domain(self, **kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TDomain(), kwargs)) + + def new_address(self, **kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TAddress(), kwargs)) + + def new_record(self, **kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TRecord(), kwargs)) + + def new_message(self, **kwargs): + assert self._conn is not None + return self._conn.add(apply_properties(TMessage(), kwargs)) + + def setup_executions(self): + self._conn.execute(create_table_statements["executions"], noresult=True) + + def setup_exceptions(self): + # careful not to call them "errors" since NXDOMAIN is not an error. + self._conn.execute(create_table_statements["exceptions"], noresult=True) + + # TODO: upsert? + + self.new_exception(name="NXDOMAIN", fail=False) + self.new_exception(name="NoAnswer", fail=True) + self.new_exception(name="NoNameservers", fail=True) + self.new_exception(name="Timeout", fail=True) + + def setup_ips(self): + from .ips import china, blocks + + self._conn.execute(create_table_statements["ips"], noresult=True) + + # TODO: upsert? + + self.new_address(ip=addr_to_int("0.0.0.0"), block_target=True) + self.new_address(ip=addr_to_int("127.0.0.1"), block_target=True) + for ip in china: + self.new_address(ip=addr_to_int(ip), china=True) + for ip in blocks: + self.new_address(ip=addr_to_int(ip), block_target=True) + + def setup_kinds(self): + self._conn.execute(create_table_statements["kinds"], noresult=True) + + # TODO: upsert? 
+ + #NXDOMAIN = self.find_one(TException, TException.name == "NXDOMAIN") + #self.new_kind(name="bad", exception=NXDOMAIN) + #self.new_kind(name="badsub", exception=NXDOMAIN) + + def setup_domains(self): + self._conn.execute(create_table_statements["domains"], noresult=True) + + def setup_records(self): + self._conn.execute(create_table_statements["records"], noresult=True) + + def setup_messages(self): + self._conn.execute(create_table_statements["messages"], noresult=True) + for trig in table_triggers["messages"]: + self._conn.execute(trig) + + def start_execution(self, dt): + execution = TExecution() + execution.start_date = dt + self.flush() + return execution + + def finish_execution(self, execution, dt, completed): + # TODO: fail if ExecutionId is missing? + execution.finish_date = dt + execution.completed = completed + self.flush() + + def next_record_id(self): + from storm.expr import Add, Max, Coalesce + expr = Add(Coalesce(Max(TRecord.record_id), 0), 1) + return self.find_one(expr) + + def find_record_id(self, addresses): + address_ids = list(address.address_id for address in addresses) + record_ids = list(self._conn.find(TRecord, TRecord.address_id.is_in(address_ids)).values(TRecord.record_id)) + if not record_ids: + return None + unique_ids = sorted(set(record_ids)) + for needle in unique_ids: + if sum(1 for id in record_ids if id == needle) == len(addresses): + found = True + return needle + return None + + def push_entry(self, entry): + kind = self.find_one(TKind, TKind.name == entry.kind) + if not kind: + kind = self.new_kind(name=entry.kind) + if entry.kind.startswith("bad"): + exception = self.find_one(TException, TException.name == "NXDOMAIN") + assert exception is not None + kind.exception = exception + + domain = self.find_one(TDomain, TDomain.name == entry.domain) + if not domain: + domain = self.new_domain(name=entry.domain) + domain.kind = kind + + addresses = [] + as_ints = sorted(set(map(addr_to_int, entry.addrs))) + for numeric in as_ints: 
+ address = self.find_one(TAddress, TAddress.ip == numeric) + if not address: + address = self.new_address(ip=numeric) + addresses.append(address) + + for address in addresses: + if entry.reason == "block": + address.block_target = True + elif entry.reason == "redirect": + address.redirect_target = True + elif entry.reason == "gfw": + address.gfw_target = True + + if addresses: + record_id = self.find_record_id(addresses) + if record_id is None: + record_id = self.next_record_id() + for address in addresses: + self.new_record(record_id=record_id, address=address) + else: + record_id = None + + numeric = addr_to_int(entry.server) + server = self.find_one(TAddress, TAddress.ip == numeric) + if not server: + server = self.new_address(ip=numeric) + self.flush() + server.server = True + + if entry.exception: + exception = self.find_one(TException, TException.name == entry.exception) + assert exception is not None + else: + exception = None + + message = self.new_message( + execution=entry.execution, + server=server, domain=domain, + record_id=record_id, exception=exception) + self.flush() diff --git a/respodns/ips.py b/respodns/ips.py new file mode 100644 index 0000000..81dccde --- /dev/null +++ b/respodns/ips.py @@ -0,0 +1,67 @@ +# known IPs of any DNS located in China: +china = { + "1.1.8.8", + "1.1.8.9", + "1.2.4.8", + "1.8.1.8", + "1.8.8.8", + "114.254.201.131", + "218.107.55.108", + "222.216.2.236", +} + +# known IPs (not servers) that are used to deny access to websites: +blocks = { + "1.2.3.4", # timeout + "54.242.237.204", # fake + "93.158.134.250", # fake + "114.6.128.8", # fake + "118.97.116.27", # fake + "119.235.29.59", # fake + "124.40.255.99", # fake + "146.112.61.106", # fake + "156.154.113.17", # fake + "156.154.175.30", # fake + "156.154.175.215", # fake + "156.154.175.216", # fake + "156.154.175.221", # fake + "163.28.10.160", # fake + "175.139.142.25", # fake + "176.103.130.135", # fake + "182.93.64.126", # fake + "192.99.140.48", # fake + 
# String prefixes of reserved / special-use ("bogon") IPv4 space.
_fixed_prefixes = [
    "0.", "10.", "127.", "169.254.", "192.0.0.", "192.0.2.", "192.168.",
    "198.18.", "198.19.", "198.51.100.", "203.0.113.",
]

bogon_checks = (
    _fixed_prefixes
    + [f"100.{i}." for i in range(64, 128)]  # 100.64.0.0/10
    + [f"172.{i}." for i in range(16, 32)]   # 172.16.0.0/12
    + [f"{i}." for i in range(224, 256)]     # 224.0.0.0/3 (class D and E)
)

def is_bogon(ip):
    """Return True when the dotted-quad string `ip` starts with a bogon prefix."""
    return ip.startswith(tuple(bogon_checks))

def ipkey(ip_string):
    """Numeric sort key for dotted-quad strings.

    Colons are treated as dots — presumably to tolerate "host:port" input;
    verify against callers.
    """
    key = 0
    for position, field in enumerate(ip_string.replace(":", ".").split(".")):
        key += 256 ** (3 - position) * int(field)
    return key
def make_simple_pooler(pool_size, finisher=None):
    """Create an async pooler that keeps at most `pool_size` tasks in flight.

    Call the returned coroutine with a coroutine/task to submit work (it
    awaits until a slot frees up), or with no argument to drain all pending
    tasks.  `finisher(done, pending)`, when given, is invoked after every
    asyncio.wait() round with the completed and still-pending task sets.
    """
    condition = asyncio.FIRST_COMPLETED
    pending = set()
    async def pooler(item=None):
        nonlocal pending
        finish = item is None
        if not finish:
            if isinstance(item, CoroutineType):
                assert not isinstance(item, asyncio.Task)
                item = asyncio.create_task(item)
            pending.add(item)
        # leave room for the next submission; drain completely on finish.
        desired_size = 0 if finish else pool_size - 1
        while len(pending) > desired_size:
            done, pending = await asyncio.wait(pending, return_when=condition)
            # BUG FIX: finisher defaults to None but was called
            # unconditionally (TypeError); guard it like make_pooler does.
            if finisher is not None:
                finisher(done, pending)
    return pooler
def alive(fp, expire):
    """Return True if file `fp` exists, is younger than `expire` seconds,
    and is bigger than a trivial couple of bytes.
    """
    from os.path import exists, getmtime, getsize
    from time import time

    # BUG FIX: honor the `expire` argument — the freshness window was
    # hard-coded to the module-level one_week, silently ignoring the
    # parameter.  The only caller passes one_week, so behavior is unchanged
    # there.
    return exists(fp) and time() < getmtime(fp) + expire and getsize(fp) > 2
def read_ips(f):
    """Yield candidate IPv4 address strings from the open text stream `f`.

    Strips "#" comments and surrounding whitespace, and skips any line that
    doesn't look like a dotted quad (exactly three dots).  Lines are
    processed lazily, one at a time.
    """
    # TODO: make more robust (regex pls) and async-friendly.
    for line in f:
        # FIX: iterate the stream instead of f.readlines(), which buffered
        # the entire input (and blocks on an open pipe until EOF).
        ip, _, _ = line.partition("#")  # partition is a no-op without "#"
        ip = ip.strip()
        if ip.count(".") != 3:
            continue
        yield ip