Script used in my blog post xyz.
You can download it by right-clicking here and choosing
"Save link as". You may need to "chmod +x" it to run it without typing "python" first. It takes one optional argument, the root directory to scan; with no argument it defaults to /Users.
#!/usr/bin/python -utt
# vim:et
'''Find and display duplicate files in a directory tree.
Based on
http://www.endlesslycurious.com/2011/06/01/finding-duplicate-files-using-python/
with a few changes:
1. Run on Python 2.6
2. Don't delete files
3. Instead of just listing 2nd+ filename(s), display filesize and
   the list of all paths. This will allow me to "ln -f path1 path2"
4. Some cosmetic stuff.
Use at your own risk, no warranty, etc.'''
import sys
import os
import hashlib
def find_duplicates(rootdir):
    """Find duplicate files in directory tree.
    Return a list of items, where each item has:
    1. file size
    2. list of duplicated paths."""
    MINFILESIZE = 1024*1024  # Don't bother with files smaller than 1MB
    files_found = 0
    filesizes = {}
    # Build up dict with key as filesize and value as list of filenames.
    # Files with a unique size can't have duplicates, so this cheap first
    # pass spares us from hashing most of the tree.
    for path, dirs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(path, filename)
            if os.path.islink(filepath):
                continue  # ignore symlinks
            filesize = os.lstat(filepath).st_size
            if filesize >= MINFILESIZE:
                filesizes.setdefault(filesize, []).append(filepath)
                files_found += 1
                if 0 == (files_found % 1000):
                    print "%d files (minsize=%d) found" % \
                        (files_found, MINFILESIZE)
    print "Total: %d files (minsize=%d)" % (files_found, MINFILESIZE)
    duplicates = []
    files_scanned = 0
    # We are only interested in size buckets with more than one entry.
    for asize, alist in filesizes.iteritems():
        if len(alist) < 2:
            continue
        hash2paths = {}
        for filepath in alist:
            # Hash in 1MB chunks so huge files aren't slurped into memory,
            # and open in binary mode so the digest covers the raw bytes.
            md5 = hashlib.md5()
            with open(filepath, 'rb') as infile:
                for chunk in iter(lambda: infile.read(1024 * 1024), ''):
                    md5.update(chunk)
            hash2paths.setdefault(md5.hexdigest(), []).append(filepath)
            files_scanned += 1
            if 0 == (files_scanned % 1000):
                print "%d files scanned" % files_scanned
        for apathlist in [val for (key, val) in hash2paths.iteritems()
                          if len(val) > 1]:
            duplicates.append([asize, apathlist])
    return duplicates
if __name__ == '__main__':
    if len(sys.argv) < 2:
        rootdir = '/Users'
    else:
        rootdir = sys.argv[1]
    DUPS = find_duplicates(rootdir)
    print '%d duplicate file groups found.' % len(DUPS)
    for size, paths in sorted(DUPS):
        print '\t%d bytes: %s' % (size, ' '.join(paths))
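
Change 3 in the docstring exists for one follow-up step: hard-linking the copies together with "ln -f path1 path2". Below is a minimal sketch of doing that step from Python rather than by hand, assuming every group of paths sits on a single filesystem (hard links can't cross filesystems) and that you're happy for the copies to share all future modifications. The function name dedupe_hardlink and the dry_run flag are my own naming for illustration, not part of the script above.

import os

def dedupe_hardlink(duplicates, dry_run=True):
    '''Sketch: replace each duplicate with a hard link to the group's
    first path. Assumes one filesystem and no permission surprises.'''
    for size, paths in duplicates:
        keep = paths[0]
        for extra in paths[1:]:
            if dry_run:
                print 'would run: ln -f %s %s' % (keep, extra)
            else:
                # os.link() refuses to overwrite an existing file, so
                # remove the duplicate first -- that's the "-f" in "ln -f".
                os.remove(extra)
                os.link(keep, extra)

Called as dedupe_hardlink(find_duplicates('/Users')) it only prints what it would do; pass dry_run=False once you've eyeballed the list.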