#!/usr/bin/python -utt
# vim:et
'''Find and display duplicate files in a directory tree.

Based on http://www.endlesslycurious.com/2011/06/01/finding-duplicate-files-using-python/
with a few changes:
1. Runs on Python 2.6.
2. Doesn't delete files.
3. Instead of just listing the 2nd+ filename(s), displays the file size and
   the list of all paths. This allows me to "ln -f path1 path2".
4. Some cosmetic stuff.
Use at your own risk, no warranty, etc.'''

import sys
import os
import hashlib


def find_duplicates(rootdir):
    """Find duplicate files in a directory tree.
    Return a list of items, where each item has:
    1. file size
    2. list of duplicated paths."""
    MINFILESIZE = 1024 * 1024  # Don't bother with files smaller than 1MB.

    # Phase 1: group files by size. Only files that share a size can
    # possibly be duplicates, so this avoids hashing everything.
    files_found = 0
    filesizes = {}  # maps file size -> list of paths with that size
    for path, dirs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(path, filename)
            if os.path.islink(filepath):
                continue  # ignore symlinks
            filesize = os.lstat(filepath).st_size
            if filesize >= MINFILESIZE:
                filesizes.setdefault(filesize, []).append(filepath)
                files_found += 1
                if files_found % 1000 == 0:
                    print "%d files (minsize=%d) found" % (files_found, MINFILESIZE)
    print "Total: %d files (minsize=%d)" % (files_found, MINFILESIZE)

    # Phase 2: within each size group of two or more files, hash the
    # contents and report groups that share a digest.
    duplicates = []
    files_scanned = 0
    for asize, alist in filesizes.iteritems():
        if len(alist) < 2:
            continue  # a unique size can't have duplicates
        hash2paths = {}  # maps MD5 digest -> list of paths with that digest
        for filepath in alist:
            # Binary mode so we hash the raw bytes; close the file promptly.
            with open(filepath, 'rb') as fileobj:
                filehash = hashlib.md5(fileobj.read()).hexdigest()
            hash2paths.setdefault(filehash, []).append(filepath)
            files_scanned += 1
            if files_scanned % 1000 == 0:
                print "%d files scanned" % files_scanned
        for apathlist in [val for val in hash2paths.itervalues() if len(val) > 1]:
            duplicates.append([asize, apathlist])
    return duplicates


if __name__ == '__main__':
    if len(sys.argv) < 2:
        rootdir = '/Users'
    else:
        rootdir = sys.argv[1]
    DUPS = find_duplicates(rootdir)
    print '%d groups of duplicate files found.' % len(DUPS)
    for size, paths in sorted(DUPS):
        print '\t%d\t%s' % (size, paths)
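

# A possible refinement, not part of the original script: hashing a file in
# fixed-size chunks keeps memory use flat even for multi-gigabyte files,
# instead of the one-shot read() used in find_duplicates() above. A minimal
# sketch; the name hash_file() and the 1MB block size are my own choices.
def hash_file(filepath, blocksize=1024 * 1024):
    """Return the hex MD5 digest of filepath, read blocksize bytes at a time."""
    md5 = hashlib.md5()
    with open(filepath, 'rb') as fileobj:
        block = fileobj.read(blocksize)
        while block:
            md5.update(block)
            block = fileobj.read(blocksize)
    return md5.hexdigest()
# The hashing lines in find_duplicates() would then collapse to:
#     filehash = hash_file(filepath)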