This commit is contained in:
Connor 2017-08-10 03:28:25 -07:00 committed by GitHub
parent 7ccb616fee
commit 12a9b1640d

15
idup.py
View file

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# find duplicate images given a hamming distance threshold.
# employs dhash to do the heavy lifting.
# does not recurse into "./_duplicate" so you can dump things there if you wish.
# doesn't recurse into "./_duplicate/" so you can dump things there if you wish.
# dependencies: pillow, dhash
import sys, os, os.path, pickle
@@ -11,12 +11,15 @@ import dhash
def lament(*args, **kwargs):
    """Echo *args to standard error; any print() keyword arguments pass through."""
    err = sys.stderr
    print(*args, file=err, **kwargs)
def result(diff, ours, theirs): # TODO: rename
print("{}\t{}\t{}".format(diff, ours, theirs))
def result(diff, p1, p2): # TODO: rename
    """Report one candidate duplicate pair on stdout: hamming distance and both paths, tab-separated."""
    fields = (diff, p1, p2)
    print("\t".join(str(field) for field in fields))
# On-disk cache of previously computed image hashes, loaded below if present
# (opened in binary mode with pickle imported — presumably a pickled mapping; verify against writer).
dbname = "idup.db"
# Image file extensions considered when scanning for duplicates.
exts = ".jpeg .jpg .png".split()
# Root of the directory tree to scan (current working directory).
rootpath = "."
# Directory the scan skips, so confirmed duplicates can be dumped there safely.
ignore_dir = os.path.join(rootpath, "_duplicate")
"""verbosity:
-1: only unrecoverable errors.
0: include failures.
@@ -34,9 +37,7 @@ args = sys.argv[1:]
threshold = int(args[0])
ignore_dir = os.path.join(rootpath, "_duplicate")
paths = {}
paths = {} # path to hash mapping.
if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
with open(dbname, "rb") as f:
@@ -55,7 +56,7 @@ for path in paths.keys():
paths = existing
def compare_hash(h1, h2):
# hashes are in byte format, so we have to convert them to integers.
# hashes are in byte strings, so we have to convert them to integers.
i1 = int.from_bytes(h1, byteorder="big")
i2 = int.from_bytes(h2, byteorder="big")
# return the hamming distance.