Merge remote-tracking branch 'image_deduplication/master'
commit 703f4b7039
1 changed file with 136 additions and 0 deletions

idup.py (Normal file, 136 additions)
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# find duplicate images given a hamming distance threshold.
# employs dhash to do the heavy lifting.
# doesn't recurse into "./_duplicate/" so you can dump things there if you wish.
# dependencies: pillow, dhash

import sys, os, os.path, pickle
from PIL import Image
import dhash

def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

def result(diff, p1, p2):  # TODO: rename
    print("{}\t{}\t{}".format(diff, p1, p2))

dbname = "idup.db"
exts = ".jpeg .jpg .png".split()

rootpath = "."
ignore_dir = os.path.join(rootpath, "_duplicate")

"""verbosity:
-1: only unrecoverable errors.
0: include failures.
1: include image opening/hashing.
2: the kitchen sink.
"""
verbosity = 1

pname = sys.argv[0]
if len(sys.argv) <= 1:
    print("usage: {} {{threshold}}".format(pname))
    print(" utilizes {} in the current working directory".format(dbname))
    sys.exit(1)
args = sys.argv[1:]

threshold = int(args[0])

paths = {}  # path to hash mapping.

if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
    with open(dbname, "rb") as f:
        paths = pickle.load(f)
    #lament("loaded", len(paths.keys()), "hashes")
else:
    if verbosity >= 0:
        lament("warning: no database found. starting from scratch.")

existing = dict((path, h) for path, h in paths.items() if os.path.exists(path))
for path in paths.keys():
    if path not in existing:
        if verbosity >= 0:
            lament("#d", path)

paths = existing

def compare_hash(h1, h2):
    # hashes are in byte strings, so we have to convert them to integers.
    i1 = int.from_bytes(h1, byteorder="big")
    i2 = int.from_bytes(h2, byteorder="big")
    # return the hamming distance.
    return bin(i1 ^ i2).count('1')

def run():
    for dn, _, fns in os.walk(rootpath):
        if dn == ignore_dir:
            continue

        for fn in fns:
            name, ext = os.path.splitext(fn)
            path = os.path.join(dn, fn)
            if ext not in exts:
                continue

            if path in paths:
                if verbosity >= 2:
                    lament("#s", path)
                continue

            try:
                image = Image.open(path)
            except OSError:
                if verbosity >= 0:
                    lament("#f", path)
            else:
                try:
                    row, col = dhash.dhash_row_col(image)
                except OSError:
                    if verbosity >= 0:
                        lament("#f", path)
                else:
                    if verbosity >= 1:
                        lament("#o", path)
                    h = dhash.format_bytes(row, col)
                    paths[path] = h
                finally:
                    image.close()

    # first pass: exact hash matching.
    hashes = dict((v, k) for k, v in paths.items())
    for p1, h in paths.items():
        p2 = hashes[h]
        if p1 != p2:
            result(-1, p1, p2)

    # second pass: fuzzy hash matching.
    if threshold <= 0:
        return
    seen = set()
    for p1, h1 in paths.items():
        if verbosity >= 2:
            lament("#c", p1)
        seen.add(p1)
        for p2, h2 in paths.items():
            if p2 in seen:
                continue
            if h1 == h2:
                continue
            diff = compare_hash(h1, h2)
            if diff <= threshold:
                result(diff, p1, p2)

try:
    run()
except KeyboardInterrupt:
    if verbosity >= 0:
        lament("# interrupted")
finally:
    if os.path.exists(dbname):
        backup = dbname+".bak"
        if os.path.exists(backup):
            os.remove(backup)
        os.rename(dbname, dbname+".bak")
    with open(dbname, "wb") as f:
        pickle.dump(paths, f)
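As a rough illustration of how compare_hash scores a pair, here is a minimal sketch assuming two hypothetical byte-string hashes of equal length (16 bytes here), like those returned by dhash.format_bytes; the hex values are made up for illustration:

    h1 = bytes.fromhex("ffe0c08000000000f0e0c08000000000")
    h2 = bytes.fromhex("ffe0c08000000001f0e0c08000000000")
    # these two values differ in a single bit, so the hamming distance is 1;
    # exact duplicates score 0, and run() reports any pair whose distance
    # is at or below the threshold given on the command line.
    assert compare_hash(h1, h2) == 1

Each reported pair is printed by result() as one tab-separated line: the distance (or -1 for an exact hash match found in the first pass), followed by the two paths.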