From 7ccb616fee78000dabd4ce47fdea15b3596015ec Mon Sep 17 00:00:00 2001
From: Connor
Date: Thu, 10 Aug 2017 03:14:16 -0700
Subject: [PATCH 1/2]

---
 idup.py | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 idup.py

diff --git a/idup.py b/idup.py
new file mode 100644
index 0000000..9fadad9
--- /dev/null
+++ b/idup.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# find duplicate images given a hamming distance threshold.
+# employs dhash to do the heavy lifting.
+# does not recurse into "./_duplicate" so you can dump things there if you wish.
+# dependencies: pillow, dhash
+
+import sys, os, os.path, pickle
+from PIL import Image
+import dhash
+
+def lament(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+def result(diff, ours, theirs): # TODO: rename
+    print("{}\t{}\t{}".format(diff, ours, theirs))
+
+dbname = "idup.db"
+exts = ".jpeg .jpg .png".split()
+
+"""verbosity:
+    -1: only unrecoverable errors.
+    0: include failures.
+    1: include image opening/hashing.
+    2: the kitchen sink.
+"""
+verbosity = 1
+
+pname = sys.argv[0]
+if len(sys.argv) <= 1:
+    print("usage: {} {{threshold}}".format(pname))
+    print(" utilizes {} in the current working directory".format(dbname))
+    sys.exit(1)
+args = sys.argv[1:]
+
+threshold = int(args[0])
+
+ignore_dir = os.path.join(rootpath, "_duplicate")
+
+paths = {}
+
+if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
+    with open(dbname, "rb") as f:
+        paths = pickle.load(f)
+    #lament("loaded", len(paths.keys()), "hashes")
+else:
+    if verbosity >= 0:
+        lament("warning: no database found. starting from scratch.")
+
+existing = dict((path, h) for path, h in paths.items() if os.path.exists(path))
+for path in paths.keys():
+    if path not in existing:
+        if verbosity >= 0:
+            lament("#d", path)
+
+paths = existing
+
+def compare_hash(h1, h2):
+    # hashes are in byte format, so we have to convert them to integers.
+    i1 = int.from_bytes(h1, byteorder="big")
+    i2 = int.from_bytes(h2, byteorder="big")
+    # return the hamming distance.
+    return bin(i1 ^ i2).count('1')
+
+def run():
+    for dn, _, fns in os.walk(rootpath):
+        if dn == ignore_dir:
+            continue
+
+        for fn in fns:
+            name, ext = os.path.splitext(fn)
+            path = os.path.join(dn, fn)
+            if ext not in exts:
+                continue
+
+            if path in paths:
+                if verbosity >= 2:
+                    lament("#s", path)
+                continue
+
+            try:
+                image = Image.open(path)
+            except OSError:
+                if verbosity >= 0:
+                    lament("#f", path)
+            else:
+                try:
+                    row, col = dhash.dhash_row_col(image)
+                except OSError:
+                    if verbosity >= 0:
+                        lament("#f", path)
+                else:
+                    if verbosity >= 1:
+                        lament("#o", path)
+                    h = dhash.format_bytes(row, col)
+                    paths[path] = h
+                finally:
+                    image.close()
+
+    # first pass: exact hash matching.
+    hashes = dict((v, k) for k, v in paths.items())
+    for p1, h in paths.items():
+        p2 = hashes[h]
+        if p1 != p2:
+            result(-1, p1, p2)
+
+    # second pass: fuzzy hash matching.
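+    # each unordered pair is compared at most once: `seen` holds the paths
+    # the outer loop has already visited (including the current one), and
+    # exact matches are skipped because the first pass reported them.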
+    if threshold <= 0:
+        return
+    seen = set()
+    for p1, h1 in paths.items():
+        if verbosity >= 2:
+            lament("#c", p1)
+        seen.add(p1)
+        for p2, h2 in paths.items():
+            if p2 in seen:
+                continue
+            if h1 == h2:
+                continue
+            diff = compare_hash(h1, h2)
+            if diff <= threshold:
+                result(diff, p1, p2)
+
+try:
+    run()
+except KeyboardInterrupt:
+    if verbosity >= 0:
+        lament("# interrupted")
+finally:
+    if os.path.exists(dbname):
+        backup = dbname+".bak"
+        if os.path.exists(backup):
+            os.remove(backup)
+        os.rename(dbname, dbname+".bak")
+    with open(dbname, "wb") as f:
+        pickle.dump(paths, f)

From 12a9b1640d5cdfc4906698d1480ba6fb1ec4e891 Mon Sep 17 00:00:00 2001
From: Connor
Date: Thu, 10 Aug 2017 03:28:25 -0700
Subject: [PATCH 2/2]

---
 idup.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/idup.py b/idup.py
index 9fadad9..5e78699 100644
--- a/idup.py
+++ b/idup.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # find duplicate images given a hamming distance threshold.
 # employs dhash to do the heavy lifting.
-# does not recurse into "./_duplicate" so you can dump things there if you wish.
+# doesn't recurse into "./_duplicate/" so you can dump things there if you wish.
 # dependencies: pillow, dhash
 
 import sys, os, os.path, pickle
@@ -11,12 +11,15 @@ import dhash
 def lament(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
-def result(diff, ours, theirs): # TODO: rename
-    print("{}\t{}\t{}".format(diff, ours, theirs))
+def result(diff, p1, p2): # TODO: rename
+    print("{}\t{}\t{}".format(diff, p1, p2))
 
 dbname = "idup.db"
 exts = ".jpeg .jpg .png".split()
 
+rootpath = "."
+ignore_dir = os.path.join(rootpath, "_duplicate")
+
 """verbosity:
     -1: only unrecoverable errors.
     0: include failures.
@@ -34,9 +37,7 @@ args = sys.argv[1:]
 
 threshold = int(args[0])
 
-ignore_dir = os.path.join(rootpath, "_duplicate")
-
-paths = {}
+paths = {} # path to hash mapping.
 
 if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
     with open(dbname, "rb") as f:
@@ -55,7 +56,7 @@ for path in paths.keys():
 paths = existing
 
 def compare_hash(h1, h2):
-    # hashes are in byte format, so we have to convert them to integers.
+    # hashes are in byte strings, so we have to convert them to integers.
     i1 = int.from_bytes(h1, byteorder="big")
     i2 = int.from_bytes(h2, byteorder="big")
     # return the hamming distance.
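
A note on usage and the comparison step: run the script from the directory
you want scanned, e.g. "python3 idup.py 4". Each output line is
diff<TAB>path1<TAB>path2, where -1 marks an exact dhash match from the first
pass. The fuzzy pass is just a hamming distance over dhash's byte output; a
minimal standalone sketch of the same idea, assuming two hypothetical files
a.png and b.png exist in the working directory:

    from PIL import Image
    import dhash

    def hamming(h1, h2):
        # byte strings -> integers, then count the differing bits.
        i1 = int.from_bytes(h1, byteorder="big")
        i2 = int.from_bytes(h2, byteorder="big")
        return bin(i1 ^ i2).count('1')

    with Image.open("a.png") as a, Image.open("b.png") as b:
        ha = dhash.format_bytes(*dhash.dhash_row_col(a))
        hb = dhash.format_bytes(*dhash.dhash_row_col(b))

    # 0 means identical dhashes; small distances suggest near-duplicates.
    # the right threshold depends on the image set.
    print(hamming(ha, hb))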