Merge branch 'periodic-commits' of git://github.com/yang/bitrot into yang-periodic-commits

Conflicts:
	src/bitrot.py
This commit is contained in:
Łukasz Langa 2013-10-19 22:17:17 +02:00
commit a8faff93e1

View File

@ -30,6 +30,7 @@ import argparse
import atexit import atexit
import datetime import datetime
import errno import errno
import functools
import hashlib import hashlib
import os import os
import shutil import shutil
@ -37,22 +38,28 @@ import sqlite3
import stat import stat
import sys import sys
import tempfile import tempfile
import time
CHUNK_SIZE = 16384 DEFAULT_CHUNK_SIZE = 16384
DOT_THRESHOLD = 200 DOT_THRESHOLD = 200
VERSION = (0, 5, 1) VERSION = (0, 5, 1)
def sha1(path): def sha1(path, chunk_size):
digest = hashlib.sha1() digest = hashlib.sha1()
with open(path) as f: with open(path) as f:
d = f.read(CHUNK_SIZE) d = f.read(chunk_size)
while d: while d:
digest.update(d) digest.update(d)
d = f.read(CHUNK_SIZE) d = f.read(chunk_size)
return digest.hexdigest() return digest.hexdigest()
def throttled_commit(conn, commit_interval, last_commit_time):
if time.time() - last_commit_time > commit_interval:
conn.commit()
return time.time()
return last_commit_time
def get_sqlite3_cursor(path, copy=False): def get_sqlite3_cursor(path, copy=False):
if copy: if copy:
@ -80,7 +87,7 @@ def get_sqlite3_cursor(path, copy=False):
return conn return conn
def run(verbosity=1, test=False): def run(verbosity=1, test=False, commit_interval=300, chunk_size=DEFAULT_CHUNK_SIZE):
current_dir = b'.' # sic, relative path current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db') bitrot_db = os.path.join(current_dir, b'.bitrot.db')
conn = get_sqlite3_cursor(bitrot_db, copy=test) conn = get_sqlite3_cursor(bitrot_db, copy=test)
@ -113,6 +120,8 @@ def run(verbosity=1, test=False):
paths.append(p) paths.append(p)
total_size += st.st_size total_size += st.st_size
paths.sort() paths.sort()
last_commit_time = 0
tcommit = functools.partial(throttled_commit, conn, commit_interval)
for p in paths: for p in paths:
st = os.stat(p) st = os.stat(p)
new_mtime = int(st.st_mtime) new_mtime = int(st.st_mtime)
@ -123,7 +132,7 @@ def run(verbosity=1, test=False):
sys.stdout.write(size_fmt) sys.stdout.write(size_fmt)
sys.stdout.flush() sys.stdout.flush()
last_reported_size = size_fmt last_reported_size = size_fmt
new_sha1 = sha1(p) new_sha1 = sha1(p, chunk_size)
update_ts = datetime.datetime.utcnow().strftime( update_ts = datetime.datetime.utcnow().strftime(
'%Y-%m-%d %H:%M:%S%z' '%Y-%m-%d %H:%M:%S%z'
) )
@ -144,13 +153,14 @@ def run(verbosity=1, test=False):
cur.execute('UPDATE bitrot SET mtime=?, path=?, ' cur.execute('UPDATE bitrot SET mtime=?, path=?, '
'timestamp=? WHERE hash=?', 'timestamp=? WHERE hash=?',
(new_mtime, p_uni, update_ts, new_sha1)) (new_mtime, p_uni, update_ts, new_sha1))
conn.commit()
last_commit_time = tcommit(last_commit_time)
break break
else: else:
new_paths.append(p) new_paths.append(p)
cur.execute('INSERT INTO bitrot VALUES (?, ?, ?, ?)', cur.execute('INSERT INTO bitrot VALUES (?, ?, ?, ?)',
(p_uni, new_mtime, new_sha1, update_ts)) (p_uni, new_mtime, new_sha1, update_ts))
conn.commit() last_commit_time = tcommit(last_commit_time)
continue continue
stored_mtime, stored_sha1, update_ts = row stored_mtime, stored_sha1, update_ts = row
if int(stored_mtime) != new_mtime: if int(stored_mtime) != new_mtime:
@ -158,7 +168,7 @@ def run(verbosity=1, test=False):
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? ' cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
'WHERE path=?', 'WHERE path=?',
(new_mtime, new_sha1, update_ts, p_uni)) (new_mtime, new_sha1, update_ts, p_uni))
conn.commit() last_commit_time = tcommit(last_commit_time)
elif stored_sha1 != new_sha1: elif stored_sha1 != new_sha1:
error_count += 1 error_count += 1
print('\rerror: SHA1 mismatch for {}: expected {}, got {}.' print('\rerror: SHA1 mismatch for {}: expected {}, got {}.'
@ -169,7 +179,8 @@ def run(verbosity=1, test=False):
) )
for path in missing_paths: for path in missing_paths:
cur.execute('DELETE FROM bitrot WHERE path=?', (path,)) cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
conn.commit() last_commit_time = tcommit(last_commit_time)
conn.commit()
cur.execute('SELECT COUNT(path) FROM bitrot') cur.execute('SELECT COUNT(path) FROM bitrot')
all_count = cur.fetchone()[0] all_count = cur.fetchone()[0]
if verbosity: if verbosity:
@ -238,6 +249,10 @@ def run_from_command_line():
help='just test against an existing database, don\'t update anything') help='just test against an existing database, don\'t update anything')
parser.add_argument('--version', action='version', parser.add_argument('--version', action='version',
version='%(prog)s {}.{}.{}'.format(*VERSION)) version='%(prog)s {}.{}.{}'.format(*VERSION))
parser.add_argument('--commit-interval', type=float, default=300,
help='min time between commits (0 commits on every operation)')
parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time')
args = parser.parse_args() args = parser.parse_args()
if args.sum: if args.sum:
try: try:
@ -250,7 +265,9 @@ def run_from_command_line():
verbosity = 0 verbosity = 0
elif args.verbose: elif args.verbose:
verbosity = 2 verbosity = 2
run(verbosity=verbosity, test=args.test) run(verbosity=verbosity, test=args.test,
commit_interval=args.commit_interval,
chunk_size=args.chunk_size)
if __name__ == '__main__': if __name__ == '__main__':