From e4efbc290cf44b2fc45d3921f1c8ce3dad5f04d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Mon, 2 May 2016 17:49:25 -0700 Subject: [PATCH] bitrot 0.8.0, fsencoding and self-integrity check --- .gitignore | 2 + README.rst | 10 +++++ src/bitrot.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 106 insertions(+), 8 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a719ef --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.bitrot.db +.bitrot.sha512 diff --git a/README.rst b/README.rst index c753c1b..c4b3303 100644 --- a/README.rst +++ b/README.rst @@ -36,6 +36,16 @@ a 100 GB Aperture library in under 10 minutes. Both tests on HFS+. Change Log ---------- +0.8.0 +~~~~~ + +* bitrot now keeps track of its own database's bitrot by storing + a checksum of .bitrot.db in .bitrot.sha512 + +* bugfix: now properly uses the filesystem encoding to decode file names + for use with the .bitrot.db database. Report and original patch by + pallinger. 
+ 0.7.1 ~~~~~ diff --git a/src/bitrot.py b/src/bitrot.py index dbac3aa..cd92af9 100644 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -42,8 +42,9 @@ import time DEFAULT_CHUNK_SIZE = 16384 DOT_THRESHOLD = 200 -VERSION = (0, 7, 1) +VERSION = (0, 8, 0) IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES} +FSENCODING = sys.getfilesystemencoding() def sha1(path, chunk_size): @@ -102,7 +103,22 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False): for path, _, files in os.walk(directory): for f in files: p = os.path.join(path, f) - p_uni = p.decode('utf8') + try: + p_uni = p.decode(FSENCODING) + except UnicodeDecodeError: + try: + print( + "warning: cannot decode file name:", + path, + file=sys.stderr, + ) + except UnicodeDecodeError: + # yup, even printing the filename might fail in certain + # occasions + pass + + continue + try: if follow_links or p_uni in expected: st = os.stat(p) @@ -146,8 +162,10 @@ class Bitrot(object): self._last_commit_ts = time.time() def run(self): - current_dir = b'.' # sic, relative path - bitrot_db = os.path.join(current_dir, b'.bitrot.db') + check_sha512_integrity() + + bitrot_db = get_path() + bitrot_sha512 = get_path(ext=b'sha512') try: conn = get_sqlite3_cursor(bitrot_db, copy=self.test) except ValueError: @@ -164,7 +182,7 @@ class Bitrot(object): current_size = 0 missing_paths = self.select_all_paths(cur) paths, total_size = list_existing_paths( - current_dir, expected=missing_paths, ignored={bitrot_db}, + b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512}, follow_links=self.follow_links, ) @@ -260,6 +278,8 @@ class Bitrot(object): missing_paths, ) + update_sha512_integrity() + if error_count: raise BitrotException( 1, 'There were {} errors found.'.format(error_count), @@ -355,9 +375,16 @@ class Bitrot(object): return new_path -def stable_sum(): - current_dir = b'.' 
# sic, relative path - bitrot_db = os.path.join(current_dir, b'.bitrot.db') +def get_path(directory=b'.', ext=b'db'): + """Compose the path to the selected bitrot file.""" + return os.path.join(directory, b'.bitrot.' + ext) + + +def stable_sum(bitrot_db): + """Calculates a stable SHA512 of all entries in the database. + + Useful for comparing if two directories hold the same data, as it ignores + timing information.""" digest = hashlib.sha512() conn = get_sqlite3_cursor(bitrot_db) cur = conn.cursor() @@ -369,7 +396,60 @@ def stable_sum(): return digest.hexdigest() +def check_sha512_integrity(): + sha512_path = get_path(ext='sha512') + if not os.path.exists(sha512_path): + return + + print('Checking bitrot.db integrity... ', end='') + with open(sha512_path, 'rb') as f: + old_sha512 = f.read().strip() + bitrot_db = get_path() + digest = hashlib.sha512() + with open(bitrot_db, 'rb') as f: + digest.update(f.read()) + new_sha512 = digest.hexdigest() + if new_sha512 != old_sha512: + if len(old_sha512) == 128: + print( + "error: SHA512 of the file is different, bitrot.db might be " + "corrupt." + ) + else: + print( + "error: SHA512 of the file is different but bitrot.sha512 has " + "a suspicious length. It might be corrupt." + ) + print( + "If you'd like to continue anyway, delete the .bitrot.sha512 " + "file and try again." + ) + raise BitrotException( + 3, 'bitrot.db integrity check failed, cannot continue.', + ) + + print('ok.') + +def update_sha512_integrity(): + old_sha512 = 0 + sha512_path = get_path(ext='sha512') + if os.path.exists(sha512_path): + with open(sha512_path, 'rb') as f: + old_sha512 = f.read().strip() + bitrot_db = get_path() + digest = hashlib.sha512() + with open(bitrot_db, 'rb') as f: + digest.update(f.read()) + new_sha512 = digest.hexdigest() + if new_sha512 != old_sha512: + print('Updating bitrot.sha512... 
', end='') + with open(sha512_path, 'wb') as f: + f.write(new_sha512) + print('done.') + def run_from_command_line(): + global FSENCODING + parser = argparse.ArgumentParser(prog='bitrot') parser.add_argument( '-l', '--follow-links', action='store_true', @@ -403,6 +483,10 @@ def run_from_command_line(): parser.add_argument( '--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE, help='read files this many bytes at a time') + parser.add_argument( + '--fsencoding', default='', + help='override the codec to decode filenames, otherwise taken from ' + 'the LANG environment variables') args = parser.parse_args() if args.sum: try: @@ -422,6 +506,8 @@ def run_from_command_line(): commit_interval=args.commit_interval, chunk_size=args.chunk_size, ) + if args.fsencoding: + FSENCODING = args.fsencoding try: bt.run() except BitrotException as bre: