#!/usr/bin/env python
"""Walk the current directory tree, computing an MD5 checksum for every
file and recording the results in a ``checksums.json`` manifest.

On later runs, files already present in the manifest are re-hashed and
any mismatch is reported; new files are appended to the manifest.
"""
import os
import json
from hashlib import md5

ROOTDIR = os.getcwd()
CHECKSUMS = "checksums.json"


def md5sum(filename):
    """Return the hex MD5 digest of *filename*.

    Opens the file and progressively feeds it to the hash in chunks,
    avoiding loading the complete contents into RAM at once.
    http://stackoverflow.com/a/24847608
    """
    # Named `digest`, not `hash`, so the builtin is not shadowed.
    digest = md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(128 * digest.block_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _load_manifest(path):
    """Load an existing checksum manifest; return {} if missing or invalid.

    Narrow exception handling: OSError covers a missing/unreadable file,
    ValueError covers malformed JSON (json.JSONDecodeError subclasses it).
    A bare ``except:`` here would also swallow KeyboardInterrupt/SystemExit.
    """
    try:
        with open(path) as f:
            return json.load(f)
    except (OSError, ValueError):
        return {}


def main():
    """Iterate over files in subdirectories of the current dir, use
    md5sum() to generate checksums for each, and merge them into a dict
    written to CHECKSUMS — or check an existing manifest for conflicts.
    """
    manifest_path = os.path.join(ROOTDIR, CHECKSUMS)
    output = _load_manifest(manifest_path)
    for folder, _subs, files in os.walk(ROOTDIR):
        for filename in files:
            if filename == CHECKSUMS:
                continue  # never checksum the manifest itself
            full_path = os.path.join(folder, filename)
            # Key files by their path relative to ROOTDIR (strip prefix + sep).
            rel_path = full_path[len(ROOTDIR) + 1:]
            checksum = md5sum(full_path)
            if rel_path in output:
                if output[rel_path] != checksum:
                    # replace with code to redownload given file?
                    print("Mismatch on {}".format(rel_path))
            else:
                output[rel_path] = checksum
    # Pretty-print the merged manifest back to the output file.
    with open(manifest_path, "w") as f:
        json.dump(output, f, indent=4)


# Guard so importing this module does not walk the tree / write the
# manifest as a side effect (the original called main() unconditionally).
if __name__ == "__main__":
    main()