diff --git a/src/bitrot.py b/src/bitrot.py index bd51228..e75475f 100644 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -30,6 +30,7 @@ import argparse import atexit import datetime import errno +import functools import hashlib import os import shutil @@ -37,22 +38,28 @@ import sqlite3 import stat import sys import tempfile +import time -CHUNK_SIZE = 16384 +DEFAULT_CHUNK_SIZE = 16384 DOT_THRESHOLD = 200 VERSION = (0, 5, 1) -def sha1(path): +def sha1(path, chunk_size): digest = hashlib.sha1() with open(path) as f: - d = f.read(CHUNK_SIZE) + d = f.read(chunk_size) while d: digest.update(d) - d = f.read(CHUNK_SIZE) + d = f.read(chunk_size) return digest.hexdigest() +def throttled_commit(conn, commit_interval, last_commit_time): + if time.time() - last_commit_time > commit_interval: + conn.commit() + return time.time() + return last_commit_time def get_sqlite3_cursor(path, copy=False): if copy: @@ -80,7 +87,7 @@ def get_sqlite3_cursor(path, copy=False): return conn -def run(verbosity=1, test=False): +def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_SIZE): current_dir = b'.' # sic, relative path bitrot_db = os.path.join(current_dir, b'.bitrot.db') conn = get_sqlite3_cursor(bitrot_db, copy=test) @@ -113,6 +120,8 @@ def run(verbosity=1, test=False): paths.append(p) total_size += st.st_size paths.sort() + last_commit_time = 0 + tcommit = functools.partial(throttled_commit, conn, commit_interval) for p in paths: st = os.stat(p) new_mtime = int(st.st_mtime) @@ -123,7 +132,7 @@ def run(verbosity=1, test=False): sys.stdout.write(size_fmt) sys.stdout.flush() last_reported_size = size_fmt - new_sha1 = sha1(p) + new_sha1 = sha1(p, chunk_size) update_ts = datetime.datetime.utcnow().strftime( '%Y-%m-%d %H:%M:%S%z' ) @@ -144,13 +153,14 @@ def run(verbosity=1, test=False): cur.execute('UPDATE bitrot SET mtime=?, path=?, ' 'timestamp=? WHERE hash=?', (new_mtime, p_uni, update_ts, new_sha1)) - conn.commit() + + last_commit_time = tcommit(last_commit_time) break else: new_paths.append(p) cur.execute('INSERT INTO bitrot VALUES (?, ?, ?, ?)', (p_uni, new_mtime, new_sha1, update_ts)) - conn.commit() + last_commit_time = tcommit(last_commit_time) continue stored_mtime, stored_sha1, update_ts = row if int(stored_mtime) != new_mtime: @@ -158,7 +168,7 @@ def run(verbosity=1, test=False): cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? ' 'WHERE path=?', (new_mtime, new_sha1, update_ts, p_uni)) - conn.commit() + last_commit_time = tcommit(last_commit_time) elif stored_sha1 != new_sha1: error_count += 1 print('\rerror: SHA1 mismatch for {}: expected {}, got {}.' @@ -169,7 +179,8 @@ def run(verbosity=1, test=False): ) for path in missing_paths: cur.execute('DELETE FROM bitrot WHERE path=?', (path,)) - conn.commit() + last_commit_time = tcommit(last_commit_time) + conn.commit() cur.execute('SELECT COUNT(path) FROM bitrot') all_count = cur.fetchone()[0] if verbosity: @@ -238,6 +249,10 @@ def run_from_command_line(): help='just test against an existing database, don\'t update anything') parser.add_argument('--version', action='version', version='%(prog)s {}.{}.{}'.format(*VERSION)) + parser.add_argument('--commit-interval', type=float, default=300, + help='min time between commits (0 commits on every operation)') + parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE, + help='read files this many bytes at a time') args = parser.parse_args() if args.sum: try: @@ -250,7 +265,9 @@ def run_from_command_line(): verbosity = 0 elif args.verbose: verbosity = 2 - run(verbosity=verbosity, test=args.test) + run(verbosity=verbosity, test=args.test, + commit_interval=args.commit_interval, + chunk_size=args.chunk_size) if __name__ == '__main__':