bitrot/src/bitrot.py
Łukasz Langa 67e7b8c904
v1.0.0
2020-05-18 00:15:24 +02:00

596 lines
21 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2013 by Łukasz Langa
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import atexit
import datetime
import errno
import hashlib
import os
import shutil
import sqlite3
import stat
import sys
import tempfile
import time
import unicodedata
from concurrent.futures import ProcessPoolExecutor, wait, as_completed
# Default number of bytes read per chunk while hashing a file.
# This is the block size in HFS+; 4X the block size in ext4.
DEFAULT_CHUNK_SIZE = 16384
DOT_THRESHOLD = 200
VERSION = (1, 0, 0)
# OSError codes that are tolerated (file vanished or is not accessible).
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
# Codec used to decode bytes file names into Unicode text.
FSENCODING = sys.getfilesystemencoding()

if sys.version_info[0] == 2:
    # On Python 2 make `str` name the Unicode text type;
    # use `bytes` for bytestrings.
    str = type(u'text')
def normalize_path(path):
    """Decode the bytes `path` using FSENCODING and return Unicode text.

    When FSENCODING is spelled 'utf-8' or 'UTF-8' the decoded text is also
    normalized to the NFKD form; for any other codec it is returned as
    decoded.
    """
    decoded = path.decode(FSENCODING)
    if FSENCODING not in ('utf-8', 'UTF-8'):
        return decoded
    return unicodedata.normalize('NFKD', decoded)
def sha1(path, chunk_size):
    """Return the hexadecimal SHA1 digest of the file at `path`.

    The file is opened in binary mode and consumed `chunk_size` bytes at
    a time, so it never has to fit in memory as a whole.
    """
    hasher = hashlib.sha1()
    with open(path, 'rb') as stream:
        # An empty read marks the end of the file.
        for block in iter(lambda: stream.read(chunk_size), b''):
            hasher.update(block)
    return hasher.hexdigest()
def ts():
    """Return the current UTC time as text, e.g. '2020-05-18 00:15:24+0000'.

    A timezone-aware datetime is used on purpose: for a naive datetime
    (such as the one returned by `utcnow()`) the `%z` directive expands to
    an empty string, so the UTC offset would silently be left out.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.strftime('%Y-%m-%d %H:%M:%S%z')
def get_sqlite3_cursor(path, copy=False):
    """Open the bitrot database at the bytes `path`; return the connection.

    Despite the name, the returned object is the `sqlite3` connection, not
    a cursor. With `copy` set, the database file is first duplicated into
    a temporary file and the connection is made to that duplicate; the
    duplicate is removed when the interpreter exits. The `bitrot` table and
    the `bitrot_hash_idx` index are created if they are missing.

    Raises ValueError when `copy` is requested and `path` does not exist.
    """
    path = path.decode(FSENCODING)
    if copy:
        if not os.path.exists(path):
            raise ValueError(
                "error: bitrot database at {} does not exist.".format(path)
            )
        scratch = tempfile.NamedTemporaryFile(
            prefix='bitrot_', suffix='.db', delete=False,
        )
        with open(path, 'rb') as original:
            try:
                shutil.copyfileobj(original, scratch)
            finally:
                scratch.close()
        path = scratch.name
        atexit.register(os.unlink, path)

    # atexit handlers run in reverse order of registration, so at exit the
    # commit registered below happens before the close registered here.
    connection = sqlite3.connect(path)
    atexit.register(connection.close)
    cursor = connection.cursor()
    existing = {
        row[0] for row in cursor.execute('SELECT name FROM sqlite_master')
    }
    if 'bitrot' not in existing:
        cursor.execute('CREATE TABLE bitrot (path TEXT PRIMARY KEY, '
                       'mtime INTEGER, hash TEXT, timestamp TEXT)')
    if 'bitrot_hash_idx' not in existing:
        cursor.execute('CREATE INDEX bitrot_hash_idx ON bitrot (hash)')
    atexit.register(connection.commit)
    return connection
def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
    """list_existing_paths(b'/dir') -> ({path1, path2, ...}, total_size)

    Returns a tuple with a set of existing regular files in `directory` and
    its subdirectories and their `total_size`. The returned paths are built
    with `os.path.join()` from what `os.walk(directory)` yields, so for
    a bytes `directory` they are bytes as well.

    Doesn't add entries listed in `ignored` (compared against the raw
    path). Doesn't add symlinks if `follow_links` is False (the default),
    except for paths present in `expected`: those are always stat-ed
    through the link. `expected` is matched against the `normalize_path()`
    form of each path.

    File names that cannot be decoded are reported on stderr and skipped.
    Files that fail to stat with one of IGNORED_FILE_SYSTEM_ERRORS are
    skipped silently; any other OSError is re-raised.
    """
    paths = set()
    total_size = 0
    for path, _, files in os.walk(directory):
        for f in files:
            p = os.path.join(path, f)
            try:
                # Normalize the same way paths are normalized before being
                # compared with database entries elsewhere in this file;
                # a merely decoded name would not match a stored entry
                # whenever normalization changes the text.
                p_uni = normalize_path(p)
            except UnicodeDecodeError:
                binary_stderr = getattr(sys.stderr, 'buffer', sys.stderr)
                binary_stderr.write(b"warning: cannot decode file name: ")
                binary_stderr.write(p)
                binary_stderr.write(b"\n")
                continue

            try:
                if follow_links or p_uni in expected:
                    st = os.stat(p)
                else:
                    st = os.lstat(p)
            except OSError as ex:
                if ex.errno not in IGNORED_FILE_SYSTEM_ERRORS:
                    raise
            else:
                if not stat.S_ISREG(st.st_mode) or p in ignored:
                    continue
                paths.add(p)
                total_size += st.st_size
    return paths, total_size
def compute_one(path, chunk_size):
    """Return a tuple with (unicode path, size, mtime, sha1). Takes a binary path.

    `chunk_size` is the number of bytes read at a time while hashing.
    The mtime is truncated to whole seconds.

    Raises BitrotException, after printing a warning to stderr, when the
    file cannot be stat-ed because of one of IGNORED_FILE_SYSTEM_ERRORS or
    when reading it for hashing fails. Any other OSError from `os.stat()`
    is re-raised unchanged.
    """
    p_uni = normalize_path(path)
    try:
        st = os.stat(path)
    except OSError as ex:
        if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
            # The file disappeared between listing existing paths and
            # this run or is (temporarily?) locked with different
            # permissions. We'll just skip it for now.
            print(
                '\rwarning: `{}` is currently unavailable for '
                'reading: {}'.format(
                    p_uni, ex,
                ),
                file=sys.stderr,
            )
            raise BitrotException
        raise  # Not expected? https://github.com/ambv/bitrot/issues/

    new_mtime = int(st.st_mtime)
    try:
        new_sha1 = sha1(path, chunk_size)
    except (IOError, OSError) as e:
        # Not every I/O error carries an errno with a symbolic name; fall
        # back to the exception text instead of failing on the lookup.
        reason = errno.errorcode.get(e.errno, str(e))
        print(
            '\rwarning: cannot compute hash of {} [{}]'.format(
                p_uni, reason,
            ),
            file=sys.stderr,
        )
        raise BitrotException
    return p_uni, st.st_size, new_mtime, new_sha1
class BitrotException(Exception):
    """Error type raised by the bitrot code in this file.

    It is raised either bare or with positional arguments whose first two
    items are an exit code and a human-readable message.
    """
class Bitrot(object):
    """Checks files under the current directory against the bitrot database.

    `run()` hashes every regular file found under `.` in a process pool,
    compares the results with the database entries and records new,
    updated, renamed and missing files.
    """

    def __init__(
        self, verbosity=1, test=False, follow_links=False, commit_interval=300,
        chunk_size=DEFAULT_CHUNK_SIZE, workers=os.cpu_count(),
    ):
        """Store the settings and create the worker process pool.

        `verbosity` 0 prints no reports, 1 prints a summary, 2 also lists
        the affected paths. With `test` set, `run()` works on a temporary
        copy of the database. `commit_interval` is the minimum number of
        seconds between intermediate commits. `chunk_size` is the read size
        used for hashing and `workers` the size of the process pool.
        """
        self.verbosity = verbosity
        self.test = test
        self.follow_links = follow_links
        self.commit_interval = commit_interval
        self.chunk_size = chunk_size
        self._last_reported_size = ''
        self._last_commit_ts = 0
        self.pool = ProcessPoolExecutor(max_workers=workers)

    def maybe_commit(self, conn):
        """Commit `conn` unless the last commit was under `commit_interval` ago."""
        if time.time() < self._last_commit_ts + self.commit_interval:
            # no time for commit yet!
            return

        conn.commit()
        self._last_commit_ts = time.time()

    def run(self):
        """Scan the current directory and bring the database up to date.

        Raises BitrotException with code 2 when `test` is set and there is
        no database yet, and with code 1 (after all work is done) when at
        least one file has an unchanged mtime but a different hash.
        """
        check_sha512_integrity(verbosity=self.verbosity)

        bitrot_db = get_path()
        bitrot_sha512 = get_path(ext=b'sha512')
        try:
            conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
        except ValueError:
            raise BitrotException(
                2,
                'No database exists so cannot test. Run the tool once first.',
            )

        cur = conn.cursor()
        new_paths = []
        updated_paths = []
        renamed_paths = []
        errors = []
        current_size = 0
        # Starts out as every path in the database; entries are discarded
        # as the corresponding files are seen, leaving the missing ones.
        missing_paths = self.select_all_paths(cur)
        hashes = self.select_all_hashes(cur)
        paths, total_size = list_existing_paths(
            b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
            follow_links=self.follow_links,
        )
        paths_uni = set(normalize_path(p) for p in paths)
        futures = [self.pool.submit(compute_one, p, self.chunk_size) for p in paths]

        for future in as_completed(futures):
            try:
                p_uni, new_size, new_mtime, new_sha1 = future.result()
            except BitrotException:
                # compute_one() already printed a warning for this file.
                continue

            current_size += new_size
            if self.verbosity:
                self.report_progress(current_size, total_size)

            if p_uni not in missing_paths:
                # We are not expecting this path, it wasn't in the database yet.
                # It's either new or a rename. Let's handle that.
                stored_path = self.handle_unknown_path(
                    cur, p_uni, new_mtime, new_sha1, paths_uni, hashes
                )
                self.maybe_commit(conn)
                if p_uni == stored_path:
                    new_paths.append(p_uni)
                    missing_paths.discard(p_uni)
                else:
                    renamed_paths.append((stored_path, p_uni))
                    missing_paths.discard(stored_path)
                continue

            # At this point we know we're seeing an expected file.
            missing_paths.discard(p_uni)
            cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE path=?',
                        (p_uni,))
            row = cur.fetchone()
            if not row:
                print(
                    '\rwarning: path disappeared from the database while running:',
                    p_uni,
                    file=sys.stderr,
                )
                continue

            stored_mtime, stored_sha1, stored_ts = row
            if int(stored_mtime) != new_mtime:
                # A changed mtime means the file was modified on purpose:
                # record the new hash instead of reporting an error.
                updated_paths.append(p_uni)
                cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
                            'WHERE path=?',
                            (new_mtime, new_sha1, ts(), p_uni))
                self.maybe_commit(conn)
                continue

            if stored_sha1 != new_sha1:
                # Same mtime but different content.
                errors.append(p_uni)
                print(
                    '\rerror: SHA1 mismatch for {}: expected {}, got {}.'
                    ' Last good hash checked on {}.'.format(
                        p_uni, stored_sha1, new_sha1, stored_ts
                    ),
                    file=sys.stderr,
                )

        for path in missing_paths:
            cur.execute('DELETE FROM bitrot WHERE path=?', (path,))

        conn.commit()

        if not self.test:
            cur.execute('vacuum')

        if self.verbosity:
            cur.execute('SELECT COUNT(path) FROM bitrot')
            all_count = cur.fetchone()[0]
            self.report_done(
                total_size,
                all_count,
                len(errors),
                new_paths,
                updated_paths,
                renamed_paths,
                missing_paths,
            )

        update_sha512_integrity(verbosity=self.verbosity)

        if errors:
            raise BitrotException(
                1, 'There were {} errors found.'.format(len(errors)), errors,
            )

    def select_all_paths(self, cur):
        """Return a set of all distinct paths in the bitrot database.

        The paths are Unicode and are normalized if FSENCODING was UTF-8.
        """
        result = set()
        cur.execute('SELECT path FROM bitrot')
        row = cur.fetchone()
        while row:
            result.add(row[0])
            row = cur.fetchone()
        return result

    def select_all_hashes(self, cur):
        """Return a dict where keys are hashes and values are sets of paths.

        The paths are Unicode and are normalized if FSENCODING was UTF-8.
        """
        result = {}
        cur.execute('SELECT hash, path FROM bitrot')
        row = cur.fetchone()
        while row:
            rhash, rpath = row
            result.setdefault(rhash, set()).add(rpath)
            row = cur.fetchone()
        return result

    def report_progress(self, current_size, total_size):
        """Write the percentage of bytes processed so far to stdout.

        Nothing is written when the formatted percentage is the same as the
        one written last time.
        """
        size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
        if size_fmt == self._last_reported_size:
            return

        sys.stdout.write(size_fmt)
        sys.stdout.flush()
        self._last_reported_size = size_fmt

    def report_done(
        self, total_size, all_count, error_count, new_paths, updated_paths,
        renamed_paths, missing_paths):
        """Print a report on what happened. All paths should be Unicode here.

        With verbosity 1 only the counts are printed; with a higher
        verbosity the new, updated, renamed and missing paths are listed
        (the three lists are sorted in place).
        """
        print('\rFinished. {:.2f} MiB of data read. {} errors found.'
              ''.format(total_size/1024/1024, error_count))
        if self.verbosity == 1:
            print(
                '{} entries in the database, {} new, {} updated, '
                '{} renamed, {} missing.'.format(
                    all_count, len(new_paths), len(updated_paths),
                    len(renamed_paths), len(missing_paths),
                ),
            )
        elif self.verbosity > 1:
            print('{} entries in the database.'.format(all_count), end=' ')
            if new_paths:
                print('{} entries new:'.format(len(new_paths)))
                new_paths.sort()
                for path in new_paths:
                    print(' ', path)
            if updated_paths:
                print('{} entries updated:'.format(len(updated_paths)))
                updated_paths.sort()
                for path in updated_paths:
                    print(' ', path)
            if renamed_paths:
                print('{} entries renamed:'.format(len(renamed_paths)))
                renamed_paths.sort()
                for path in renamed_paths:
                    print(
                        ' from',
                        path[0],
                        'to',
                        path[1],
                    )
            if missing_paths:
                print('{} entries missing:'.format(len(missing_paths)))
                missing_paths = sorted(missing_paths)
                for path in missing_paths:
                    print(' ', path)
            # The summary line above was printed without a newline. Finish
            # it only if no section followed it; renames count as a section
            # too, otherwise a stray blank line would be printed after them.
            if not any((new_paths, updated_paths, renamed_paths, missing_paths)):
                print()
        if self.test and self.verbosity:
            print('warning: database file not updated on disk (test mode).')

    def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_uni, hashes):
        """Either add a new entry to the database or update the existing entry
        on rename.

        `cur` is the database cursor. `new_path` is the new Unicode path.
        `paths_uni` are Unicode paths seen on disk during this run of Bitrot.
        `hashes` is a dictionary selected from the database, keys are hashes, values
        are sets of Unicode paths that are stored in the DB under the given hash.
        On a rename `hashes` is updated in place so that it keeps describing
        what is stored in the database.

        Returns `new_path` if the entry was indeed new or the `old_path` (e.g.
        outdated path stored in the database for this hash) if there was a rename.
        """
        for old_path in hashes.get(new_sha1, ()):
            if old_path not in paths_uni:
                # File of the same hash used to exist but no longer does.
                # Let's treat `new_path` as a renamed version of that `old_path`.
                cur.execute(
                    'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
                    (new_mtime, new_path, ts(), old_path),
                )
                # The row for `old_path` no longer exists. Without this
                # bookkeeping another new file with the same hash would be
                # matched against `old_path` again, the UPDATE would touch
                # no rows and that file would never be stored. Mutating the
                # set is safe here because we leave the loop right away.
                known_paths = hashes[new_sha1]
                known_paths.discard(old_path)
                known_paths.add(new_path)
                return old_path

        # Either we haven't found `new_sha1` at all in the database, or all
        # currently stored paths for this hash still point to existing files.
        # Let's insert a new entry for what appears to be a new file.
        cur.execute(
            'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
            (new_path, new_mtime, new_sha1, ts()),
        )
        return new_path
def get_path(directory=b'.', ext=b'db'):
    """Return the bytes path of the `.bitrot.<ext>` file inside `directory`."""
    file_name = b'.'.join((b'.bitrot', ext))
    return os.path.join(directory, file_name)
def stable_sum(bitrot_db=None):
    """Return a hexadecimal SHA512 computed over all hashes in the database.

    The stored hashes are fed to the digest ordered by path; mtimes and
    timestamps are not read at all, so the result is useful for comparing
    whether two directories hold the same data. `bitrot_db` defaults to the
    database path returned by `get_path()`.
    """
    db_path = get_path() if bitrot_db is None else bitrot_db
    hasher = hashlib.sha512()
    connection = get_sqlite3_cursor(db_path)
    rows = connection.cursor().execute('SELECT hash FROM bitrot ORDER BY path')
    for (stored_hash,) in rows:
        hasher.update(stored_hash.encode('ascii'))
    return hasher.hexdigest()
def check_sha512_integrity(verbosity=1):
    """Verify the database file against the digest kept in `.bitrot.sha512`.

    Does nothing when the `.bitrot.sha512` file does not exist. Otherwise
    the SHA512 of the database file is compared with the stored one and
    BitrotException with code 3 is raised on a mismatch. Progress and
    diagnostics are printed only when `verbosity` is truthy.
    """
    sha512_path = get_path(ext=b'sha512')
    if not os.path.exists(sha512_path):
        return

    if verbosity:
        print('Checking bitrot.db integrity... ', end='')
        sys.stdout.flush()

    with open(sha512_path, 'rb') as checksum_file:
        recorded = checksum_file.read().strip()
    with open(get_path(), 'rb') as db_file:
        actual = hashlib.sha512(db_file.read()).hexdigest().encode('ascii')

    if actual == recorded:
        if verbosity:
            print('ok.')
        return

    if verbosity:
        # A SHA512 hex digest is 128 characters long; anything else means
        # the checksum file itself does not look right.
        if len(recorded) == 128:
            complaint = (
                "error: SHA512 of the file is different, bitrot.db might "
                "be corrupt."
            )
        else:
            complaint = (
                "error: SHA512 of the file is different but bitrot.sha512 "
                "has a suspicious length. It might be corrupt."
            )
        print(complaint)
        print(
            "If you'd like to continue anyway, delete the .bitrot.sha512 "
            "file and try again.",
            file=sys.stderr,
        )
    raise BitrotException(
        3, 'bitrot.db integrity check failed, cannot continue.',
    )
def update_sha512_integrity(verbosity=1):
    """Rewrite `.bitrot.sha512` if the digest of the database file changed.

    The SHA512 of the database file is computed and compared with the
    content of the checksum file (if there is one); the checksum file is
    written only when the two differ. Progress is printed when `verbosity`
    is truthy.
    """
    sha512_path = get_path(ext=b'sha512')
    if os.path.exists(sha512_path):
        with open(sha512_path, 'rb') as checksum_file:
            recorded = checksum_file.read().strip()
    else:
        # Never equal to a digest, which forces the file to be written.
        recorded = 0

    with open(get_path(), 'rb') as db_file:
        current = hashlib.sha512(db_file.read()).hexdigest().encode('ascii')

    if current == recorded:
        return

    if verbosity:
        print('Updating bitrot.sha512... ', end='')
        sys.stdout.flush()
    with open(sha512_path, 'wb') as checksum_file:
        checksum_file.write(current)
    if verbosity:
        print('done.')
def run_from_command_line():
    """Parse the command line and either print the stable sum or run Bitrot.

    With `--sum` only the SHA-512 from `stable_sum()` is printed. Otherwise
    a `Bitrot` object is configured from the arguments and run; when it
    raises BitrotException, the message is printed to stderr and the
    process exits with the code carried by the exception.
    """
    global FSENCODING

    parser = argparse.ArgumentParser(prog='bitrot')
    parser.add_argument(
        '-l', '--follow-links', action='store_true',
        help='follow symbolic links and store target files\' hashes. Once '
             'a path is present in the database, it will be checked against '
             'changes in content even if it becomes a symbolic link. In '
             'other words, if you run `bitrot -l`, on subsequent runs '
             'symbolic links registered during the first run will be '
             'properly followed and checked even if you run without `-l`.')
    parser.add_argument(
        '-q', '--quiet', action='store_true',
        help='don\'t print anything besides checksum errors')
    parser.add_argument(
        '-s', '--sum', action='store_true',
        help='using only the data already gathered, return a SHA-512 sum '
             'of hashes of all the entries in the database. No timestamps '
             'are used in calculation.')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='list new, updated and missing entries')
    parser.add_argument(
        '-t', '--test', action='store_true',
        help='just test against an existing database, don\'t update anything')
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s {}.{}.{}'.format(*VERSION))
    parser.add_argument(
        '--commit-interval', type=float, default=300,
        help='min time in seconds between commits '
             '(0 commits on every operation)')
    parser.add_argument(
        '-w', '--workers', type=int, default=os.cpu_count(),
        help='run this many workers (use -w1 for slow magnetic disks)')
    parser.add_argument(
        '--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
        help='read files this many bytes at a time')
    parser.add_argument(
        '--fsencoding', default='',
        help='override the codec to decode filenames, otherwise taken from '
             'the LANG environment variables')
    args = parser.parse_args()

    if args.fsencoding:
        # Apply the override before anything decodes a path, so that it is
        # in effect for `--sum` as well as for a regular run.
        FSENCODING = args.fsencoding

    if args.sum:
        try:
            print(stable_sum())
        except RuntimeError as e:
            # Print the message as text; encoding it to bytes first would
            # show up as a b'...' literal.
            print(e, file=sys.stderr)
    else:
        verbosity = 1
        if args.quiet:
            verbosity = 0
        elif args.verbose:
            verbosity = 2
        bt = Bitrot(
            verbosity=verbosity,
            test=args.test,
            follow_links=args.follow_links,
            commit_interval=args.commit_interval,
            chunk_size=args.chunk_size,
            workers=args.workers,
        )
        try:
            bt.run()
        except BitrotException as bre:
            # args[0] is the exit code, args[1] the message.
            print('error:', bre.args[1], file=sys.stderr)
            sys.exit(bre.args[0])
# Run the command-line interface only when this file is executed as
# a script, not when it is imported as a module.
if __name__ == '__main__':
    run_from_command_line()