From fc46cb7c53d889b26dfd372b00cc787991b69e8d Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 29 Aug 2013 15:39:33 -0700 Subject: [PATCH 1/3] Add optional commit throttling --- src/bitrot.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/bitrot.py b/src/bitrot.py index 97fe219..eb33b21 100644 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -36,6 +36,7 @@ import sqlite3 import stat import sys import tempfile +import time CHUNK_SIZE = 16384 @@ -79,7 +80,7 @@ def get_sqlite3_cursor(path, copy=False): return conn -def run(verbosity=1, test=False): +def run(verbosity=1, test=False, commit_interval=300): current_dir = b'.' # sic, relative path bitrot_db = os.path.join(current_dir, b'.bitrot.db') conn = get_sqlite3_cursor(bitrot_db, copy=test) @@ -107,6 +108,11 @@ def run(verbosity=1, test=False): paths.append(p) total_size += st.st_size paths.sort() + last_commit_time = [time.time()] + def throttled_commit(): + if time.time() - last_commit_time[0] > commit_interval: + conn.commit() + last_commit_time[0] = time.time() for p in paths: st = os.stat(p) new_mtime = int(st.st_mtime) @@ -138,13 +144,13 @@ def run(verbosity=1, test=False): cur.execute('UPDATE bitrot SET mtime=?, path=?, ' 'timestamp=? WHERE hash=?', (new_mtime, p_uni, update_ts, new_sha1)) - conn.commit() + throttled_commit() break else: new_paths.append(p) cur.execute('INSERT INTO bitrot VALUES (?, ?, ?, ?)', (p_uni, new_mtime, new_sha1, update_ts)) - conn.commit() + throttled_commit() continue stored_mtime, stored_sha1, update_ts = row if int(stored_mtime) != new_mtime: @@ -152,7 +158,7 @@ def run(verbosity=1, test=False): cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? ' 'WHERE path=?', (new_mtime, new_sha1, update_ts, p_uni)) - conn.commit() + throttled_commit() elif stored_sha1 != new_sha1: error_count += 1 print('\rerror: SHA1 mismatch for {}: expected {}, got {}.' @@ -163,7 +169,8 @@ def run(verbosity=1, test=False): ) for path in missing_paths: cur.execute('DELETE FROM bitrot WHERE path=?', (path,)) - conn.commit() + throttled_commit() + conn.commit() cur.execute('SELECT COUNT(path) FROM bitrot') all_count = cur.fetchone()[0] if verbosity: @@ -232,6 +239,8 @@ def run_from_command_line(): help='just test against an existing database, don\'t update anything') parser.add_argument('--version', action='version', version='%(prog)s {}.{}.{}'.format(*VERSION)) + parser.add_argument('--commit-interval', type=float, default=300, + help='min time between commits (0 commits on every operation)') args = parser.parse_args() if args.sum: try: @@ -244,7 +253,7 @@ def run_from_command_line(): verbosity = 0 elif args.verbose: verbosity = 2 - run(verbosity=verbosity, test=args.test) + run(verbosity=verbosity, test=args.test, commit_interval=args.commit_interval) if __name__ == '__main__': From b6faaf94fade594552cfabee175a8e12aa988636 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 29 Aug 2013 15:51:06 -0700 Subject: [PATCH 2/3] Make chunk size configurable --- src/bitrot.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/bitrot.py b/src/bitrot.py index eb33b21..653bca8 100644 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -39,20 +39,11 @@ import tempfile import time -CHUNK_SIZE = 16384 +DEFAULT_CHUNK_SIZE = 16384 DOT_THRESHOLD = 200 VERSION = (0, 5, 1) -def sha1(path): - digest = hashlib.sha1() - with open(path) as f: - d = f.read(CHUNK_SIZE) - while d: - digest.update(d) - d = f.read(CHUNK_SIZE) - return digest.hexdigest() - def get_sqlite3_cursor(path, copy=False): if copy: @@ -80,7 +71,7 @@ def get_sqlite3_cursor(path, copy=False): return conn -def run(verbosity=1, test=False, commit_interval=300): +def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_SIZE): current_dir = b'.' # sic, relative path bitrot_db = os.path.join(current_dir, b'.bitrot.db') conn = get_sqlite3_cursor(bitrot_db, copy=test) @@ -113,6 +104,14 @@ def run(verbosity=1, test=False, commit_interval=300): if time.time() - last_commit_time[0] > commit_interval: conn.commit() last_commit_time[0] = time.time() + def sha1(path): + digest = hashlib.sha1() + with open(path) as f: + d = f.read(chunk_size) + while d: + digest.update(d) + d = f.read(chunk_size) + return digest.hexdigest() for p in paths: st = os.stat(p) new_mtime = int(st.st_mtime) @@ -241,6 +240,8 @@ def run_from_command_line(): version='%(prog)s {}.{}.{}'.format(*VERSION)) parser.add_argument('--commit-interval', type=float, default=300, help='min time between commits (0 commits on every operation)') + parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE, + help='read files this many bytes at a time') args = parser.parse_args() if args.sum: try: @@ -253,7 +254,9 @@ def run_from_command_line(): verbosity = 0 elif args.verbose: verbosity = 2 - run(verbosity=verbosity, test=args.test, commit_interval=args.commit_interval) + run(verbosity=verbosity, test=args.test, + commit_interval=args.commit_interval, + chunk_size=args.chunk_size) if __name__ == '__main__': From 11e94f663cfe32884df9ccd390ad16cd24f33b3b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 17 Oct 2013 11:40:01 -0700 Subject: [PATCH 3/3] Clean up throttling and sha1 from feedback --- src/bitrot.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/bitrot.py b/src/bitrot.py index 653bca8..c2590b3 100644 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -29,6 +29,7 @@ from __future__ import unicode_literals import argparse import atexit import datetime +import functools import hashlib import os import shutil @@ -44,6 +45,20 @@ DOT_THRESHOLD = 200 VERSION = (0, 5, 1) +def sha1(path, chunk_size): + digest = hashlib.sha1() + with open(path) as f: + d = f.read(chunk_size) + while d: + digest.update(d) + d = f.read(chunk_size) + return digest.hexdigest() + +def throttled_commit(conn, commit_interval, last_commit_time): + if time.time() - last_commit_time > commit_interval: + conn.commit() + return time.time() + return last_commit_time def get_sqlite3_cursor(path, copy=False): if copy: @@ -99,19 +114,8 @@ def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_S paths.append(p) total_size += st.st_size paths.sort() - last_commit_time = [time.time()] - def throttled_commit(): - if time.time() - last_commit_time[0] > commit_interval: - conn.commit() - last_commit_time[0] = time.time() - def sha1(path): - digest = hashlib.sha1() - with open(path) as f: - d = f.read(chunk_size) - while d: - digest.update(d) - d = f.read(chunk_size) - return digest.hexdigest() + last_commit_time = 0 + tcommit = functools.partial(throttled_commit, conn, commit_interval) for p in paths: st = os.stat(p) new_mtime = int(st.st_mtime) @@ -122,7 +126,7 @@ def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_S sys.stdout.write(size_fmt) sys.stdout.flush() last_reported_size = size_fmt - new_sha1 = sha1(p) + new_sha1 = sha1(p, chunk_size) update_ts = datetime.datetime.utcnow().strftime( '%Y-%m-%d %H:%M:%S%z' ) @@ -143,13 +147,14 @@ def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_S cur.execute('UPDATE bitrot SET mtime=?, path=?, ' 'timestamp=? WHERE hash=?', (new_mtime, p_uni, update_ts, new_sha1)) - throttled_commit() + + last_commit_time = tcommit(last_commit_time) break else: new_paths.append(p) cur.execute('INSERT INTO bitrot VALUES (?, ?, ?, ?)', (p_uni, new_mtime, new_sha1, update_ts)) - throttled_commit() + last_commit_time = tcommit(last_commit_time) continue stored_mtime, stored_sha1, update_ts = row if int(stored_mtime) != new_mtime: @@ -157,7 +162,7 @@ def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_S cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? ' 'WHERE path=?', (new_mtime, new_sha1, update_ts, p_uni)) - throttled_commit() + last_commit_time = tcommit(last_commit_time) elif stored_sha1 != new_sha1: error_count += 1 print('\rerror: SHA1 mismatch for {}: expected {}, got {}.' @@ -168,7 +173,7 @@ def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_S ) for path in missing_paths: cur.execute('DELETE FROM bitrot WHERE path=?', (path,)) - throttled_commit() + last_commit_time = tcommit(last_commit_time) conn.commit() cur.execute('SELECT COUNT(path) FROM bitrot') all_count = cur.fetchone()[0]