Script used in my blog post xyz.
You can download it by right-clicking here and choosing
"Save link as". You may need to "chmod +x" it to run it without typing "python" first. It takes one optional argument, the root directory to scan; with no argument it defaults to /Users.
#!/usr/bin/python -utt
# vim:et
'''Find and display duplicate files in a directory tree.
Based on
http://www.endlesslycurious.com/2011/06/01/finding-duplicate-files-using-python/
with a few changes:
1. Run on Python 2.6
2. Don't delete files
3. Instead of just listing 2nd+ filename(s), display filesize and
   the list of all paths. This will allow me to "ln -f path1 path2"
4. Some cosmetic stuff.
Use at your own risk, no warranty, etc.'''
import sys
import os
import hashlib
def find_duplicates(rootdir):
    """Find duplicate files in directory tree.
    Return a list of items, where each item has:
    1. file size
    2. list of duplicated paths."""
    MINFILESIZE = 1024*1024  # Don't bother with files smaller than 1MB
    files_found = 0
    filesizes = {}
    # Build up dict with key as filesize and value as list of filenames.
    # Files with a unique size can't have duplicates, so this cheap first
    # pass spares us from hashing most of the tree.
    for path, dirs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(path, filename)
            if os.path.islink(filepath):
                continue  # ignore symlinks
            filesize = os.lstat(filepath).st_size
            if filesize >= MINFILESIZE:
                filesizes.setdefault(filesize, []).append(filepath)
                files_found += 1
                if 0 == (files_found % 1000):
                    print "%d files (minsize=%d) found" % \
                        (files_found, MINFILESIZE)
    print "Total: %d files (minsize=%d)" % (files_found, MINFILESIZE)
    duplicates = []
    files_scanned = 0
    # We are only interested in size buckets with more than one entry.
    for asize, alist in filesizes.iteritems():
        if len(alist) < 2:
            continue
        hash2paths = {}
        for filepath in alist:
            # Hash in 1MB chunks so huge files aren't slurped into memory,
            # and open in binary mode so the digest covers the raw bytes.
            md5 = hashlib.md5()
            with open(filepath, 'rb') as infile:
                for chunk in iter(lambda: infile.read(1024 * 1024), ''):
                    md5.update(chunk)
            hash2paths.setdefault(md5.hexdigest(), []).append(filepath)
            files_scanned += 1
            if 0 == (files_scanned % 1000):
                print "%d files scanned" % files_scanned
        for apathlist in [val for (key, val) in hash2paths.iteritems()
                          if len(val) > 1]:
            duplicates.append([asize, apathlist])
    return duplicates
if __name__ == '__main__':
    if len(sys.argv) < 2:
        rootdir = '/Users'
    else:
        rootdir = sys.argv[1]
    DUPS = find_duplicates(rootdir)
    print '%d duplicate file groups found.' % len(DUPS)
    for size, paths in sorted(DUPS):
        print '\t%d bytes: %s' % (size, ' '.join(paths))
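
Change 3 in the docstring exists for one follow-up step: hard-linking the copies together with "ln -f path1 path2". Below is a minimal sketch of doing that step from Python rather than by hand, assuming every group of paths sits on a single filesystem (hard links can't cross filesystems) and that you're happy for the copies to share all future modifications. The function name dedupe_hardlink and the dry_run flag are my own naming for illustration, not part of the script above.

import os

def dedupe_hardlink(duplicates, dry_run=True):
    '''Sketch: replace each duplicate with a hard link to the group's
    first path. Assumes one filesystem and no permission surprises.'''
    for size, paths in duplicates:
        keep = paths[0]
        for extra in paths[1:]:
            if dry_run:
                print 'would run: ln -f %s %s' % (keep, extra)
            else:
                # os.link() refuses to overwrite an existing file, so
                # remove the duplicate first -- that's the "-f" in "ln -f".
                os.remove(extra)
                os.link(keep, extra)

Called as dedupe_hardlink(find_duplicates('/Users')) it only prints what it would do; pass dry_run=False once you've eyeballed the list.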