bitrot 0.8.0, fsencoding and self-integrity check

This commit is contained in:
Łukasz Langa 2016-05-02 17:49:25 -07:00
parent a09f0b0ad6
commit e4efbc290c
3 changed files with 106 additions and 8 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.bitrot.db
.bitrot.sha512

View File

@ -36,6 +36,16 @@ a 100 GB Aperture library in under 10 minutes. Both tests on HFS+.
Change Log
----------
0.8.0
~~~~~
* bitrot now keeps track of its own database's bitrot by storing
a checksum of .bitrot.db in .bitrot.sha512
* bugfix: now properly uses the filesystem encoding to decode file names
for use with the .bitrotdb database. Report and original patch by
pallinger.
0.7.1
~~~~~

View File

@ -42,8 +42,9 @@ import time
DEFAULT_CHUNK_SIZE = 16384
DOT_THRESHOLD = 200
VERSION = (0, 7, 1)
VERSION = (0, 8, 0)
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
FSENCODING = sys.getfilesystemencoding()
def sha1(path, chunk_size):
@ -102,7 +103,22 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
for path, _, files in os.walk(directory):
for f in files:
p = os.path.join(path, f)
p_uni = p.decode('utf8')
try:
p_uni = p.decode(FSENCODING)
except UnicodeDecodeError:
try:
print(
"warning: cannot decode file name:",
path,
file=sys.stderr,
)
except UnicodeDecodeError:
# yup, even printing the filename might fail in certain
# occasions
pass
continue
try:
if follow_links or p_uni in expected:
st = os.stat(p)
@ -146,8 +162,10 @@ class Bitrot(object):
self._last_commit_ts = time.time()
def run(self):
current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
check_sha512_integrity()
bitrot_db = get_path()
bitrot_sha512 = get_path(ext=b'sha512')
try:
conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
except ValueError:
@ -164,7 +182,7 @@ class Bitrot(object):
current_size = 0
missing_paths = self.select_all_paths(cur)
paths, total_size = list_existing_paths(
current_dir, expected=missing_paths, ignored={bitrot_db},
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)
@ -260,6 +278,8 @@ class Bitrot(object):
missing_paths,
)
update_sha512_integrity()
if error_count:
raise BitrotException(
1, 'There were {} errors found.'.format(error_count),
@ -355,9 +375,16 @@ class Bitrot(object):
return new_path
def stable_sum():
current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
def get_path(directory=b'.', ext=b'db'):
"""Compose the path to the selected bitrot file."""
return os.path.join(directory, b'.bitrot.' + ext)
def stable_sum(bitrot_db):
"""Calculates a stable SHA512 of all entries in the database.
Useful for comparing if two directories hold the same data, as it ignores
timing information."""
digest = hashlib.sha512()
conn = get_sqlite3_cursor(bitrot_db)
cur = conn.cursor()
@ -369,7 +396,60 @@ def stable_sum():
return digest.hexdigest()
def check_sha512_integrity():
sha512_path = get_path(ext='sha512')
if not os.path.exists(sha512_path):
return
print('Checking bitrot.db integrity... ', end='')
with open(sha512_path, 'rb') as f:
old_sha512 = f.read().strip()
bitrot_db = get_path()
digest = hashlib.sha512()
with open(bitrot_db, 'rb') as f:
digest.update(f.read())
new_sha512 = digest.hexdigest()
if new_sha512 != old_sha512:
if len(old_sha512) == 128:
print(
"error: SHA512 of the file is different, bitrot.db might be "
"corrupt."
)
else:
print(
"error: SHA512 of the file is different but bitrot.sha512 has "
"a suspicious length. It might be corrupt."
)
print(
"If you'd like to continue anyway, delete the .bitrot.sha512 "
"file and try again."
)
raise BitrotException(
3, 'bitrot.db integrity check failed, cannot continue.',
)
print('ok.')
def update_sha512_integrity():
old_sha512 = 0
sha512_path = get_path(ext='sha512')
if os.path.exists(sha512_path):
with open(sha512_path, 'rb') as f:
old_sha512 = f.read().strip()
bitrot_db = get_path()
digest = hashlib.sha512()
with open(bitrot_db, 'rb') as f:
digest.update(f.read())
new_sha512 = digest.hexdigest()
if new_sha512 != old_sha512:
print('Updating bitrot.sha512... ', end='')
with open(sha512_path, 'wb') as f:
f.write(new_sha512)
print('done.')
def run_from_command_line():
global FSENCODING
parser = argparse.ArgumentParser(prog='bitrot')
parser.add_argument(
'-l', '--follow-links', action='store_true',
@ -403,6 +483,10 @@ def run_from_command_line():
parser.add_argument(
'--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time')
parser.add_argument(
'--fsencoding', default='',
help='override the codec to decode filenames, otherwise taken from '
'the LANG environment variables')
args = parser.parse_args()
if args.sum:
try:
@ -422,6 +506,8 @@ def run_from_command_line():
commit_interval=args.commit_interval,
chunk_size=args.chunk_size,
)
if args.fsencoding:
FSENCODING = args.fsencoding
try:
bt.run()
except BitrotException as bre: