[0.7.0] Multiple bug fixes and refactors
This commit is contained in:
parent
08c6d436bf
commit
13b0067ac8
20
README.rst
20
README.rst
@ -36,6 +36,23 @@ a 100 GB Aperture library in under 10 minutes. Both tests on HFS+.
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
0.7.0
|
||||||
|
~~~~~
|
||||||
|
|
||||||
|
* when a file changes or is renamed, the timestamp of the last check is
|
||||||
|
updated, too
|
||||||
|
|
||||||
|
* bugfix: files that disappeared during the run are now properly ignored
|
||||||
|
|
||||||
|
* bugfix: files that are locked or otherwise inaccessible are
|
||||||
|
skipped. If they were read before, they will be considered "missing"
|
||||||
|
in the report.
|
||||||
|
|
||||||
|
* bugfix: if there are multiple files with the same content in the
|
||||||
|
scanned directory tree, renames are now handled properly for them
|
||||||
|
|
||||||
|
* refactored some horrible code to be a little less horrible
|
||||||
|
|
||||||
0.6.0
|
0.6.0
|
||||||
~~~~~
|
~~~~~
|
||||||
|
|
||||||
@ -103,4 +120,5 @@ Authors
|
|||||||
-------
|
-------
|
||||||
|
|
||||||
Glued together by `Łukasz Langa <mailto:lukasz@langa.pl>`_. Multiple
|
Glued together by `Łukasz Langa <mailto:lukasz@langa.pl>`_. Multiple
|
||||||
improvements by `Yang Zhang <mailto:yaaang@gmail.com>`_.
|
improvements by `Yang Zhang <mailto:yaaang@gmail.com>`_ and `Jean-Louis
|
||||||
|
Fuchs <mailto:ganwell@fangorn.ch>`_.
|
||||||
|
339
src/bitrot.py
339
src/bitrot.py
@ -30,7 +30,6 @@ import argparse
|
|||||||
import atexit
|
import atexit
|
||||||
import datetime
|
import datetime
|
||||||
import errno
|
import errno
|
||||||
import functools
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
@ -43,7 +42,8 @@ import time
|
|||||||
|
|
||||||
DEFAULT_CHUNK_SIZE = 16384
|
DEFAULT_CHUNK_SIZE = 16384
|
||||||
DOT_THRESHOLD = 200
|
DOT_THRESHOLD = 200
|
||||||
VERSION = (0, 6, 0)
|
VERSION = (0, 7, 0)
|
||||||
|
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
|
||||||
|
|
||||||
|
|
||||||
def sha1(path, chunk_size):
|
def sha1(path, chunk_size):
|
||||||
@ -56,11 +56,8 @@ def sha1(path, chunk_size):
|
|||||||
return digest.hexdigest()
|
return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def throttled_commit(conn, commit_interval, last_commit_time):
|
def ts():
|
||||||
if time.time() - last_commit_time > commit_interval:
|
return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S%z')
|
||||||
conn.commit()
|
|
||||||
last_commit_time = time.time()
|
|
||||||
return last_commit_time
|
|
||||||
|
|
||||||
|
|
||||||
def get_sqlite3_cursor(path, copy=False):
|
def get_sqlite3_cursor(path, copy=False):
|
||||||
@ -90,134 +87,208 @@ def get_sqlite3_cursor(path, copy=False):
|
|||||||
return conn
|
return conn
|
||||||
|
|
||||||
|
|
||||||
def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
|
def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
|
||||||
chunk_size=DEFAULT_CHUNK_SIZE):
|
"""list_existing_paths('/dir') -> ([path1, path2, ...], total_size)
|
||||||
current_dir = b'.' # sic, relative path
|
|
||||||
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
|
Returns a tuple with a list with existing files in `directory` and their
|
||||||
try:
|
`total_size`.
|
||||||
conn = get_sqlite3_cursor(bitrot_db, copy=test)
|
|
||||||
except ValueError:
|
Doesn't add entries listed in `ignored`. Doesn't add symlinks if
|
||||||
print('No database exists so cannot test. Run the tool once first.')
|
`follow_links` is False (the default). All entries present in `expected`
|
||||||
sys.exit(2)
|
must be files (can't be directories or symlinks).
|
||||||
cur = conn.cursor()
|
"""
|
||||||
new_paths = []
|
|
||||||
updated_paths = []
|
|
||||||
renamed_paths = []
|
|
||||||
error_count = 0
|
|
||||||
total_size = 0
|
|
||||||
current_size = 0
|
|
||||||
last_reported_size = ''
|
|
||||||
missing_paths = set()
|
|
||||||
cur.execute('SELECT path FROM bitrot')
|
|
||||||
row = cur.fetchone()
|
|
||||||
while row:
|
|
||||||
missing_paths.add(row[0])
|
|
||||||
row = cur.fetchone()
|
|
||||||
paths = []
|
paths = []
|
||||||
for path, _, files in os.walk(current_dir):
|
total_size = 0
|
||||||
|
for path, _, files in os.walk(directory):
|
||||||
for f in files:
|
for f in files:
|
||||||
p = os.path.join(path, f)
|
p = os.path.join(path, f)
|
||||||
p_uni = p.decode('utf8')
|
p_uni = p.decode('utf8')
|
||||||
try:
|
try:
|
||||||
if follow_links or p_uni in missing_paths:
|
if follow_links or p_uni in expected:
|
||||||
st = os.stat(p)
|
st = os.stat(p)
|
||||||
else:
|
else:
|
||||||
st = os.lstat(p)
|
st = os.lstat(p)
|
||||||
except OSError as ex:
|
except OSError as ex:
|
||||||
if ex.errno != errno.ENOENT:
|
if ex.errno not in IGNORED_FILE_SYSTEM_ERRORS:
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
if not stat.S_ISREG(st.st_mode) or p == bitrot_db:
|
if not stat.S_ISREG(st.st_mode) or p in ignored:
|
||||||
continue
|
continue
|
||||||
paths.append(p)
|
paths.append(p)
|
||||||
total_size += st.st_size
|
total_size += st.st_size
|
||||||
paths.sort()
|
paths.sort()
|
||||||
last_commit_time = 0
|
return paths, total_size
|
||||||
tcommit = functools.partial(throttled_commit, conn, commit_interval)
|
|
||||||
for p in paths:
|
|
||||||
st = os.stat(p)
|
class BitrotException(Exception):
|
||||||
new_mtime = int(st.st_mtime)
|
pass
|
||||||
current_size += st.st_size
|
|
||||||
if verbosity:
|
|
||||||
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
|
class Bitrot(object):
|
||||||
if size_fmt != last_reported_size:
|
def __init__(
|
||||||
sys.stdout.write(size_fmt)
|
self, verbosity=1, test=False, follow_links=False, commit_interval=300,
|
||||||
sys.stdout.flush()
|
chunk_size=DEFAULT_CHUNK_SIZE,
|
||||||
last_reported_size = size_fmt
|
):
|
||||||
p_uni = p.decode('utf8')
|
self.verbosity = verbosity
|
||||||
missing_paths.discard(p_uni)
|
self.test = test
|
||||||
|
self.follow_links = follow_links
|
||||||
|
self.commit_interval = commit_interval
|
||||||
|
self.chunk_size = chunk_size
|
||||||
|
self._last_reported_size = ''
|
||||||
|
self._last_commit_ts = 0
|
||||||
|
|
||||||
|
def maybe_commit(self, conn):
|
||||||
|
if time.time() < self._last_commit_ts + self.commit_interval:
|
||||||
|
# no time for commit yet!
|
||||||
|
return
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
self._last_commit_ts = time.time()
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
current_dir = b'.' # sic, relative path
|
||||||
|
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
|
||||||
try:
|
try:
|
||||||
new_sha1 = sha1(p, chunk_size)
|
conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
|
||||||
except (IOError, OSError) as e:
|
except ValueError:
|
||||||
if verbosity:
|
raise BitrotException(
|
||||||
|
2,
|
||||||
|
'No database exists so cannot test. Run the tool once first.',
|
||||||
|
)
|
||||||
|
|
||||||
|
cur = conn.cursor()
|
||||||
|
new_paths = []
|
||||||
|
updated_paths = []
|
||||||
|
renamed_paths = []
|
||||||
|
error_count = 0
|
||||||
|
current_size = 0
|
||||||
|
missing_paths = self.select_all_paths(cur)
|
||||||
|
paths, total_size = list_existing_paths(
|
||||||
|
current_dir, expected=missing_paths, ignored={bitrot_db},
|
||||||
|
follow_links=self.follow_links,
|
||||||
|
)
|
||||||
|
|
||||||
|
for p in paths:
|
||||||
|
p_uni = p.decode('utf8')
|
||||||
|
try:
|
||||||
|
st = os.stat(p)
|
||||||
|
except OSError as ex:
|
||||||
|
if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
|
||||||
|
# The file disappeared between listing existing paths and
|
||||||
|
# this run or is (temporarily?) locked with different
|
||||||
|
# permissions. We'll just skip it for now.
|
||||||
|
if self.verbosity:
|
||||||
|
print(
|
||||||
|
'\rwarning: `{}` is currently unavailable for '
|
||||||
|
'reading: {}'.format(
|
||||||
|
p_uni, ex,
|
||||||
|
),
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
raise # Not expected? https://github.com/ambv/bitrot/issues/
|
||||||
|
|
||||||
|
new_mtime = int(st.st_mtime)
|
||||||
|
current_size += st.st_size
|
||||||
|
if self.verbosity:
|
||||||
|
self.report_progress(current_size, total_size)
|
||||||
|
|
||||||
|
missing_paths.discard(p_uni)
|
||||||
|
try:
|
||||||
|
new_sha1 = sha1(p, self.chunk_size)
|
||||||
|
except (IOError, OSError) as e:
|
||||||
|
if self.verbosity:
|
||||||
|
print(
|
||||||
|
'\rwarning: cannot compute hash of {} [{}]'.format(
|
||||||
|
p, errno.errorcode[e.args[0]],
|
||||||
|
),
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
|
||||||
|
'path=?', (p_uni,))
|
||||||
|
row = cur.fetchone()
|
||||||
|
if not row:
|
||||||
|
stored_path = self.handle_unknown_path(
|
||||||
|
cur, p_uni, new_mtime, new_sha1,
|
||||||
|
)
|
||||||
|
self.maybe_commit(conn)
|
||||||
|
|
||||||
|
if p_uni == stored_path:
|
||||||
|
new_paths.append(p) # FIXME: shouldn't that be p_uni?
|
||||||
|
else:
|
||||||
|
renamed_paths.append((stored_path, p_uni))
|
||||||
|
missing_paths.discard(stored_path)
|
||||||
|
continue
|
||||||
|
|
||||||
|
stored_mtime, stored_sha1, stored_ts = row
|
||||||
|
if int(stored_mtime) != new_mtime:
|
||||||
|
updated_paths.append(p)
|
||||||
|
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
|
||||||
|
'WHERE path=?',
|
||||||
|
(new_mtime, new_sha1, ts(), p_uni))
|
||||||
|
self.maybe_commit(conn)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if stored_sha1 != new_sha1:
|
||||||
|
error_count += 1
|
||||||
print(
|
print(
|
||||||
'\rwarning: cannot compute hash of {} [{}]'.format(
|
'\rerror: SHA1 mismatch for {}: expected {}, got {}.'
|
||||||
p, errno.errorcode[e.args[0]],
|
' Last good hash checked on {}.'.format(
|
||||||
|
p, stored_sha1, new_sha1, stored_ts
|
||||||
),
|
),
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
continue
|
|
||||||
update_ts = datetime.datetime.utcnow().strftime(
|
|
||||||
'%Y-%m-%d %H:%M:%S%z'
|
|
||||||
)
|
|
||||||
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
|
|
||||||
'path=?', (p_uni,))
|
|
||||||
row = cur.fetchone()
|
|
||||||
if not row:
|
|
||||||
cur.execute('SELECT mtime, path, timestamp FROM bitrot WHERE '
|
|
||||||
'hash=?', (new_sha1,))
|
|
||||||
rows = cur.fetchall()
|
|
||||||
for row in rows:
|
|
||||||
stored_mtime, stored_path, update_ts = row
|
|
||||||
if not os.path.exists(stored_path):
|
|
||||||
renamed_paths.append((stored_path, p_uni))
|
|
||||||
missing_paths.discard(stored_path)
|
|
||||||
cur.execute('UPDATE bitrot SET mtime=?, path=?, '
|
|
||||||
'timestamp=?, hash=? WHERE path=?',
|
|
||||||
(
|
|
||||||
new_mtime,
|
|
||||||
p_uni,
|
|
||||||
update_ts,
|
|
||||||
new_sha1,
|
|
||||||
stored_path
|
|
||||||
))
|
|
||||||
|
|
||||||
last_commit_time = tcommit(last_commit_time)
|
for path in missing_paths:
|
||||||
break
|
cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
|
||||||
else:
|
|
||||||
new_paths.append(p)
|
conn.commit()
|
||||||
cur.execute(
|
|
||||||
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
|
if self.verbosity:
|
||||||
(p_uni, new_mtime, new_sha1, update_ts),
|
cur.execute('SELECT COUNT(path) FROM bitrot')
|
||||||
)
|
all_count = cur.fetchone()[0]
|
||||||
last_commit_time = tcommit(last_commit_time)
|
self.report_done(
|
||||||
continue
|
total_size,
|
||||||
stored_mtime, stored_sha1, update_ts = row
|
all_count,
|
||||||
if int(stored_mtime) != new_mtime:
|
error_count,
|
||||||
updated_paths.append(p)
|
new_paths,
|
||||||
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
|
updated_paths,
|
||||||
'WHERE path=?',
|
renamed_paths,
|
||||||
(new_mtime, new_sha1, update_ts, p_uni))
|
missing_paths,
|
||||||
last_commit_time = tcommit(last_commit_time)
|
|
||||||
elif stored_sha1 != new_sha1:
|
|
||||||
error_count += 1
|
|
||||||
print(
|
|
||||||
'\rerror: SHA1 mismatch for {}: expected {}, got {}.'
|
|
||||||
' Original info from {}.'.format(
|
|
||||||
p, stored_sha1, new_sha1, update_ts
|
|
||||||
),
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
)
|
||||||
for path in missing_paths:
|
|
||||||
cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
|
if error_count:
|
||||||
last_commit_time = tcommit(last_commit_time)
|
raise BitrotException(
|
||||||
conn.commit()
|
1, 'There were {} errors found.'.format(error_count),
|
||||||
cur.execute('SELECT COUNT(path) FROM bitrot')
|
)
|
||||||
all_count = cur.fetchone()[0]
|
|
||||||
if verbosity:
|
def select_all_paths(self, cur):
|
||||||
|
result = set()
|
||||||
|
cur.execute('SELECT path FROM bitrot')
|
||||||
|
row = cur.fetchone()
|
||||||
|
while row:
|
||||||
|
result.add(row[0])
|
||||||
|
row = cur.fetchone()
|
||||||
|
return result
|
||||||
|
|
||||||
|
def report_progress(self, current_size, total_size):
|
||||||
|
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
|
||||||
|
if size_fmt == self._last_reported_size:
|
||||||
|
return
|
||||||
|
|
||||||
|
sys.stdout.write(size_fmt)
|
||||||
|
sys.stdout.flush()
|
||||||
|
self._last_reported_size = size_fmt
|
||||||
|
|
||||||
|
def report_done(
|
||||||
|
self, total_size, all_count, error_count, new_paths, updated_paths,
|
||||||
|
renamed_paths, missing_paths):
|
||||||
print('\rFinished. {:.2f} MiB of data read. {} errors found.'
|
print('\rFinished. {:.2f} MiB of data read. {} errors found.'
|
||||||
''.format(total_size/1024/1024, error_count))
|
''.format(total_size/1024/1024, error_count))
|
||||||
if verbosity == 1:
|
if self.verbosity == 1:
|
||||||
print(
|
print(
|
||||||
'{} entries in the database, {} new, {} updated, '
|
'{} entries in the database, {} new, {} updated, '
|
||||||
'{} renamed, {} missing.'.format(
|
'{} renamed, {} missing.'.format(
|
||||||
@ -225,7 +296,7 @@ def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
|
|||||||
len(renamed_paths), len(missing_paths),
|
len(renamed_paths), len(missing_paths),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
elif verbosity > 1:
|
elif self.verbosity > 1:
|
||||||
print('{} entries in the database.'.format(all_count), end=' ')
|
print('{} entries in the database.'.format(all_count), end=' ')
|
||||||
if new_paths:
|
if new_paths:
|
||||||
print('{} entries new:'.format(len(new_paths)))
|
print('{} entries new:'.format(len(new_paths)))
|
||||||
@ -249,10 +320,39 @@ def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
|
|||||||
print(' ', path)
|
print(' ', path)
|
||||||
if not any((new_paths, updated_paths, missing_paths)):
|
if not any((new_paths, updated_paths, missing_paths)):
|
||||||
print()
|
print()
|
||||||
if test:
|
if self.test:
|
||||||
print('warning: database file not updated on disk (test mode).')
|
print('warning: database file not updated on disk (test mode).')
|
||||||
if error_count:
|
|
||||||
sys.exit(1)
|
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1):
|
||||||
|
"""Either add a new entry to the database or update the existing entry
|
||||||
|
on rename.
|
||||||
|
|
||||||
|
Returns `new_path` if the entry was indeed new or the `stored_path` (i.e.
|
||||||
|
outdated path) if there was a rename.
|
||||||
|
"""
|
||||||
|
cur.execute('SELECT mtime, path, timestamp FROM bitrot WHERE hash=?',
|
||||||
|
(new_sha1,))
|
||||||
|
rows = cur.fetchall()
|
||||||
|
for row in rows:
|
||||||
|
stored_mtime, stored_path, stored_ts = row
|
||||||
|
if os.path.exists(stored_path):
|
||||||
|
# file still exists, move on
|
||||||
|
continue
|
||||||
|
|
||||||
|
# update the path in the database
|
||||||
|
cur.execute(
|
||||||
|
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
|
||||||
|
(new_mtime, new_path, ts(), stored_path),
|
||||||
|
)
|
||||||
|
|
||||||
|
return stored_path
|
||||||
|
|
||||||
|
# no rename, just a new file with the same hash
|
||||||
|
cur.execute(
|
||||||
|
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
|
||||||
|
(new_path, new_mtime, new_sha1, ts()),
|
||||||
|
)
|
||||||
|
return new_path
|
||||||
|
|
||||||
|
|
||||||
def stable_sum():
|
def stable_sum():
|
||||||
@ -315,13 +415,18 @@ def run_from_command_line():
|
|||||||
verbosity = 0
|
verbosity = 0
|
||||||
elif args.verbose:
|
elif args.verbose:
|
||||||
verbosity = 2
|
verbosity = 2
|
||||||
run(
|
bt = Bitrot(
|
||||||
verbosity=verbosity,
|
verbosity=verbosity,
|
||||||
test=args.test,
|
test=args.test,
|
||||||
follow_links=args.follow_links,
|
follow_links=args.follow_links,
|
||||||
commit_interval=args.commit_interval,
|
commit_interval=args.commit_interval,
|
||||||
chunk_size=args.chunk_size,
|
chunk_size=args.chunk_size,
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
bt.run()
|
||||||
|
except BitrotException as bre:
|
||||||
|
print('error:', bre.args[1], file=sys.stderr)
|
||||||
|
sys.exit(bre.args[0])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user