[0.7.0] Multiple bug fixes and refactors

This commit is contained in:
Łukasz Langa 2015-06-22 18:08:26 -07:00
parent 08c6d436bf
commit 13b0067ac8
2 changed files with 241 additions and 118 deletions

View File

@ -36,6 +36,23 @@ a 100 GB Aperture library in under 10 minutes. Both tests on HFS+.
Change Log Change Log
---------- ----------
0.7.0
~~~~~
* when a file changes or is renamed, the timestamp of the last check is
updated, too
* bugfix: files that disappeared during the run are now properly ignored
* bugfix: files that are locked or otherwise inaccessible are
skipped. If they were successfully read on an earlier run, they will
be listed as "missing" in the report.
* bugfix: if there are multiple files with the same content in the
scanned directory tree, renames are now handled properly for them
* refactored some horrible code to be a little less horrible
0.6.0 0.6.0
~~~~~ ~~~~~
@ -103,4 +120,5 @@ Authors
------- -------
Glued together by `Łukasz Langa <mailto:lukasz@langa.pl>`_. Multiple Glued together by `Łukasz Langa <mailto:lukasz@langa.pl>`_. Multiple
improvements by `Yang Zhang <mailto:yaaang@gmail.com>`_. improvements by `Yang Zhang <mailto:yaaang@gmail.com>`_ and `Jean-Louis
Fuchs <mailto:ganwell@fangorn.ch>`_.

View File

@ -30,7 +30,6 @@ import argparse
import atexit import atexit
import datetime import datetime
import errno import errno
import functools
import hashlib import hashlib
import os import os
import shutil import shutil
@ -43,7 +42,8 @@ import time
DEFAULT_CHUNK_SIZE = 16384 DEFAULT_CHUNK_SIZE = 16384
DOT_THRESHOLD = 200 DOT_THRESHOLD = 200
VERSION = (0, 6, 0) VERSION = (0, 7, 0)
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
def sha1(path, chunk_size): def sha1(path, chunk_size):
@ -56,11 +56,8 @@ def sha1(path, chunk_size):
return digest.hexdigest() return digest.hexdigest()
def throttled_commit(conn, commit_interval, last_commit_time): def ts():
if time.time() - last_commit_time > commit_interval: return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S%z')
conn.commit()
last_commit_time = time.time()
return last_commit_time
def get_sqlite3_cursor(path, copy=False): def get_sqlite3_cursor(path, copy=False):
@ -90,134 +87,208 @@ def get_sqlite3_cursor(path, copy=False):
return conn return conn
def run(verbosity=1, test=False, follow_links=False, commit_interval=300, def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
chunk_size=DEFAULT_CHUNK_SIZE): """list_existing_paths('/dir') -> ([path1, path2, ...], total_size)
current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db') Returns a tuple with a list with existing files in `directory` and their
try: `total_size`.
conn = get_sqlite3_cursor(bitrot_db, copy=test)
except ValueError: Doesn't add entries listed in `ignored`. Doesn't add symlinks if
print('No database exists so cannot test. Run the tool once first.') `follow_links` is False (the default). All entries present in `expected`
sys.exit(2) must be files (can't be directories or symlinks).
cur = conn.cursor() """
new_paths = []
updated_paths = []
renamed_paths = []
error_count = 0
total_size = 0
current_size = 0
last_reported_size = ''
missing_paths = set()
cur.execute('SELECT path FROM bitrot')
row = cur.fetchone()
while row:
missing_paths.add(row[0])
row = cur.fetchone()
paths = [] paths = []
for path, _, files in os.walk(current_dir): total_size = 0
for path, _, files in os.walk(directory):
for f in files: for f in files:
p = os.path.join(path, f) p = os.path.join(path, f)
p_uni = p.decode('utf8') p_uni = p.decode('utf8')
try: try:
if follow_links or p_uni in missing_paths: if follow_links or p_uni in expected:
st = os.stat(p) st = os.stat(p)
else: else:
st = os.lstat(p) st = os.lstat(p)
except OSError as ex: except OSError as ex:
if ex.errno != errno.ENOENT: if ex.errno not in IGNORED_FILE_SYSTEM_ERRORS:
raise raise
else: else:
if not stat.S_ISREG(st.st_mode) or p == bitrot_db: if not stat.S_ISREG(st.st_mode) or p in ignored:
continue continue
paths.append(p) paths.append(p)
total_size += st.st_size total_size += st.st_size
paths.sort() paths.sort()
last_commit_time = 0 return paths, total_size
tcommit = functools.partial(throttled_commit, conn, commit_interval)
for p in paths:
st = os.stat(p) class BitrotException(Exception):
new_mtime = int(st.st_mtime) pass
current_size += st.st_size
if verbosity:
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1)) class Bitrot(object):
if size_fmt != last_reported_size: def __init__(
sys.stdout.write(size_fmt) self, verbosity=1, test=False, follow_links=False, commit_interval=300,
sys.stdout.flush() chunk_size=DEFAULT_CHUNK_SIZE,
last_reported_size = size_fmt ):
p_uni = p.decode('utf8') self.verbosity = verbosity
missing_paths.discard(p_uni) self.test = test
self.follow_links = follow_links
self.commit_interval = commit_interval
self.chunk_size = chunk_size
self._last_reported_size = ''
self._last_commit_ts = 0
def maybe_commit(self, conn):
    """Commit `conn` unless the previous commit happened too recently.

    Nothing happens while fewer than `self.commit_interval` seconds have
    elapsed since `self._last_commit_ts`.  Otherwise `conn.commit()` is
    called and `self._last_commit_ts` is set to the current time.
    """
    next_commit_due = self._last_commit_ts + self.commit_interval
    if time.time() >= next_commit_due:
        conn.commit()
        self._last_commit_ts = time.time()
def run(self):
current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
try: try:
new_sha1 = sha1(p, chunk_size) conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
except (IOError, OSError) as e: except ValueError:
if verbosity: raise BitrotException(
2,
'No database exists so cannot test. Run the tool once first.',
)
cur = conn.cursor()
new_paths = []
updated_paths = []
renamed_paths = []
error_count = 0
current_size = 0
missing_paths = self.select_all_paths(cur)
paths, total_size = list_existing_paths(
current_dir, expected=missing_paths, ignored={bitrot_db},
follow_links=self.follow_links,
)
for p in paths:
p_uni = p.decode('utf8')
try:
st = os.stat(p)
except OSError as ex:
if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
# The file disappeared between listing existing paths and
# this run or is (temporarily?) locked with different
# permissions. We'll just skip it for now.
if self.verbosity:
print(
'\rwarning: `{}` is currently unavailable for '
'reading: {}'.format(
p_uni, ex,
),
file=sys.stderr,
)
continue
raise # Not expected? https://github.com/ambv/bitrot/issues/
new_mtime = int(st.st_mtime)
current_size += st.st_size
if self.verbosity:
self.report_progress(current_size, total_size)
missing_paths.discard(p_uni)
try:
new_sha1 = sha1(p, self.chunk_size)
except (IOError, OSError) as e:
if self.verbosity:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
continue
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (p_uni,))
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1,
)
self.maybe_commit(conn)
if p_uni == stored_path:
new_paths.append(p) # FIXME: shouldn't that be p_uni?
else:
renamed_paths.append((stored_path, p_uni))
missing_paths.discard(stored_path)
continue
stored_mtime, stored_sha1, stored_ts = row
if int(stored_mtime) != new_mtime:
updated_paths.append(p)
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
'WHERE path=?',
(new_mtime, new_sha1, ts(), p_uni))
self.maybe_commit(conn)
continue
if stored_sha1 != new_sha1:
error_count += 1
print( print(
'\rwarning: cannot compute hash of {} [{}]'.format( '\rerror: SHA1 mismatch for {}: expected {}, got {}.'
p, errno.errorcode[e.args[0]], ' Last good hash checked on {}.'.format(
p, stored_sha1, new_sha1, stored_ts
), ),
file=sys.stderr, file=sys.stderr,
) )
continue
update_ts = datetime.datetime.utcnow().strftime(
'%Y-%m-%d %H:%M:%S%z'
)
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (p_uni,))
row = cur.fetchone()
if not row:
cur.execute('SELECT mtime, path, timestamp FROM bitrot WHERE '
'hash=?', (new_sha1,))
rows = cur.fetchall()
for row in rows:
stored_mtime, stored_path, update_ts = row
if not os.path.exists(stored_path):
renamed_paths.append((stored_path, p_uni))
missing_paths.discard(stored_path)
cur.execute('UPDATE bitrot SET mtime=?, path=?, '
'timestamp=?, hash=? WHERE path=?',
(
new_mtime,
p_uni,
update_ts,
new_sha1,
stored_path
))
last_commit_time = tcommit(last_commit_time) for path in missing_paths:
break cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
else:
new_paths.append(p) conn.commit()
cur.execute(
'INSERT INTO bitrot VALUES (?, ?, ?, ?)', if self.verbosity:
(p_uni, new_mtime, new_sha1, update_ts), cur.execute('SELECT COUNT(path) FROM bitrot')
) all_count = cur.fetchone()[0]
last_commit_time = tcommit(last_commit_time) self.report_done(
continue total_size,
stored_mtime, stored_sha1, update_ts = row all_count,
if int(stored_mtime) != new_mtime: error_count,
updated_paths.append(p) new_paths,
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? ' updated_paths,
'WHERE path=?', renamed_paths,
(new_mtime, new_sha1, update_ts, p_uni)) missing_paths,
last_commit_time = tcommit(last_commit_time)
elif stored_sha1 != new_sha1:
error_count += 1
print(
'\rerror: SHA1 mismatch for {}: expected {}, got {}.'
' Original info from {}.'.format(
p, stored_sha1, new_sha1, update_ts
),
file=sys.stderr,
) )
for path in missing_paths:
cur.execute('DELETE FROM bitrot WHERE path=?', (path,)) if error_count:
last_commit_time = tcommit(last_commit_time) raise BitrotException(
conn.commit() 1, 'There were {} errors found.'.format(error_count),
cur.execute('SELECT COUNT(path) FROM bitrot') )
all_count = cur.fetchone()[0]
if verbosity: def select_all_paths(self, cur):
result = set()
cur.execute('SELECT path FROM bitrot')
row = cur.fetchone()
while row:
result.add(row[0])
row = cur.fetchone()
return result
def report_progress(self, current_size, total_size):
    """Write the processed fraction to stdout when its rendering changes.

    The fraction `current_size / total_size` is formatted as a percentage
    prefixed with a carriage return.  A `total_size` of zero is treated as
    one to avoid dividing by zero.  The text is written and flushed only
    when it differs from `self._last_reported_size`, which is then updated
    to the newly written text.
    """
    fraction = current_size/(total_size or 1)
    rendered = '\r{:>6.1%}'.format(fraction)
    if rendered != self._last_reported_size:
        sys.stdout.write(rendered)
        sys.stdout.flush()
        self._last_reported_size = rendered
def report_done(
self, total_size, all_count, error_count, new_paths, updated_paths,
renamed_paths, missing_paths):
print('\rFinished. {:.2f} MiB of data read. {} errors found.' print('\rFinished. {:.2f} MiB of data read. {} errors found.'
''.format(total_size/1024/1024, error_count)) ''.format(total_size/1024/1024, error_count))
if verbosity == 1: if self.verbosity == 1:
print( print(
'{} entries in the database, {} new, {} updated, ' '{} entries in the database, {} new, {} updated, '
'{} renamed, {} missing.'.format( '{} renamed, {} missing.'.format(
@ -225,7 +296,7 @@ def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
len(renamed_paths), len(missing_paths), len(renamed_paths), len(missing_paths),
), ),
) )
elif verbosity > 1: elif self.verbosity > 1:
print('{} entries in the database.'.format(all_count), end=' ') print('{} entries in the database.'.format(all_count), end=' ')
if new_paths: if new_paths:
print('{} entries new:'.format(len(new_paths))) print('{} entries new:'.format(len(new_paths)))
@ -249,10 +320,39 @@ def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
print(' ', path) print(' ', path)
if not any((new_paths, updated_paths, missing_paths)): if not any((new_paths, updated_paths, missing_paths)):
print() print()
if test: if self.test:
print('warning: database file not updated on disk (test mode).') print('warning: database file not updated on disk (test mode).')
if error_count:
sys.exit(1) def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1):
"""Either add a new entry to the database or update the existing entry
on rename.
Returns `new_path` if the entry was indeed new or the `stored_path` (e.g.
outdated path) if there was a rename.
"""
cur.execute('SELECT mtime, path, timestamp FROM bitrot WHERE hash=?',
(new_sha1,))
rows = cur.fetchall()
for row in rows:
stored_mtime, stored_path, stored_ts = row
if os.path.exists(stored_path):
# file still exists, move on
continue
# update the path in the database
cur.execute(
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
(new_mtime, new_path, ts(), stored_path),
)
return stored_path
# no rename, just a new file with the same hash
cur.execute(
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
(new_path, new_mtime, new_sha1, ts()),
)
return new_path
def stable_sum(): def stable_sum():
@ -315,13 +415,18 @@ def run_from_command_line():
verbosity = 0 verbosity = 0
elif args.verbose: elif args.verbose:
verbosity = 2 verbosity = 2
run( bt = Bitrot(
verbosity=verbosity, verbosity=verbosity,
test=args.test, test=args.test,
follow_links=args.follow_links, follow_links=args.follow_links,
commit_interval=args.commit_interval, commit_interval=args.commit_interval,
chunk_size=args.chunk_size, chunk_size=args.chunk_size,
) )
try:
bt.run()
except BitrotException as bre:
print('error:', bre.args[1], file=sys.stderr)
sys.exit(bre.args[0])
if __name__ == '__main__': if __name__ == '__main__':