Determining Identical Files in Python, Bash
I've been consolidating files from multiple computers lately. Here are a couple quick scripts that I found useful.
1. Create an MD5 hash of every file in a directory.
2. Find duplicate MD5 hashes in multiple files.
3. Do stuff to the files which are duplicates.
# md5.py
import hashlib
import os
import os.path
import sys
from subprocess import Popen, PIPE
def walk(start_dir='/'):
    """Print 'md5hex path' (one line per file) for every regular file under start_dir.

    Iterative depth-first traversal using an explicit stack, so deep trees
    don't hit the recursion limit. Entries that are neither regular files
    nor directories (sockets, broken symlinks, ...) are skipped.
    """
    directories = [start_dir]
    while directories:
        directory = directories.pop()
        for name in os.listdir(directory):
            fullpath = os.path.join(directory, name)
            if os.path.isfile(fullpath):
                # Hash in-process with hashlib instead of shelling out to the
                # macOS-only `md5` binary: portable, no fork per file, and no
                # fragile parsing of the "MD5 (path) = hash" output format.
                digest = hashlib.md5()
                with open(fullpath, 'rb') as f:
                    # Read in 1 MiB chunks so large files aren't loaded whole.
                    for chunk in iter(lambda: f.read(1 << 20), b''):
                        digest.update(chunk)
                print(digest.hexdigest() + ' ' + fullpath)
            elif os.path.isdir(fullpath):
                directories.append(fullpath)
if __name__ == "__main__":
    walk(sys.argv[1])

> python md5.py /path/to/dir1 > dir1_md5_files.txt
> python md5.py /path/to/dir2 > dir2_md5_files.txt

# get just the md5 hashes
> cut -d' ' -f1 dir1_md5_files.txt > dir1_md5s.txt
> cut -d' ' -f1 dir2_md5_files.txt > dir2_md5s.txt

# find the duplicates
> cat dir1_md5s.txt dir2_md5s.txt | sort | uniq -d > md5_dupes.txt