#!/usr/bin/env python3
from imohash import hashfile as _imohash

import argparse
import hashlib
import os
import psutil
import sqlite3


# Chunked-read pattern adapted from https://stackoverflow.com/a/17782753
def file_hash(path, block_size=4096 * 8):
    '''
    Return the blake2b digest of the file at `path`.

    The chunk size should be a multiple of the filesystem block size to
    avoid performance issues; 4096 octets is the NTFS default, so each
    read here covers eight blocks.
    '''
    _hash = hashlib.blake2b()
    # Read in chunks only when the file would not fit in available RAM;
    # otherwise a single read() is fine.
    split = os.path.getsize(path) >= psutil.virtual_memory().available
    with open(path, 'rb') as f:
        if split:
            for chunk in iter(lambda: f.read(block_size), b''):
                _hash.update(chunk)
        else:
            _hash.update(f.read())
    return _hash.digest()


def hashify(top):
    '''
    Walk `top` and return (path, imohash, blake) tuples. The cheap imohash
    is computed for every file; the full blake2b digest is computed only
    when an imohash collision flags a potential duplicate.
    '''
    old_dir = os.getcwd()
    os.chdir(top)
    entries = []
    seen = {}  # imohash -> index of the first entry with that hash
    try:
        for root, dirs, files in os.walk('.'):
            for name in files:
                filepath = os.path.join(root, name)
                # The try sits inside the loop so one unreadable file does
                # not skip the rest of its directory.
                try:
                    imohash = _imohash(filepath)
                    real_hash = None
                    if imohash in seen:
                        print(filepath)
                        real_hash = file_hash(filepath)
                        # Backfill the first file that produced this imohash
                        # so both sides of the collision carry a full digest.
                        first = seen[imohash]
                        path0, imo0, blake0 = entries[first]
                        if blake0 is None:
                            entries[first] = (path0, imo0, file_hash(path0))
                    else:
                        seen[imohash] = len(entries)
                    entries.append((filepath, imohash, real_hash))
                except PermissionError:
                    print('Access denied:', filepath)
                except Exception as e:
                    print(e, filepath)
    finally:
        # Restore the working directory even if the walk fails.
        os.chdir(old_dir)
    return entries


def init_database(path):
    con = sqlite3.connect(path)
    cur = con.cursor()
    cur.execute(
        'CREATE TABLE IF NOT EXISTS `paths` ('
        '`path` TEXT, `imohash` BLOB, `blake` BLOB, '
        'UNIQUE(`path`, `imohash`, `blake`));'
    )
    con.commit()
    return con


def backup(top, db_path):
    paths = hashify(top)
    con = init_database(db_path)
    cur = con.cursor()
    cur.executemany('INSERT OR IGNORE INTO `paths` VALUES (?,?,?);', paths)
    con.commit()
    con.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('dir')
    parser.add_argument('-d', '--database', default='fs.db')
    args = parser.parse_args()
    # backup() initializes the database itself, so no separate init call
    # is needed here.
    backup(args.dir, args.database)
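
# Usage sketch (hypothetical filename and paths, not part of the script):
#
#   $ python backup.py /mnt/photos -d photos.db
#
# Candidate duplicates are rows sharing a blake digest; once the scan has
# run, they can be listed straight from the resulting SQLite database:
#
#   SELECT hex(blake), GROUP_CONCAT(path, ', ')
#   FROM paths
#   WHERE blake IS NOT NULL
#   GROUP BY blake
#   HAVING COUNT(*) > 1;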