bitrot/src/bitrot.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2013 by Łukasz Langa
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
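"""Detect bit rot in a directory tree.

Each regular file is hashed with SHA-1 and recorded in an SQLite database
(``.bitrot.db``) together with its modification time.  On later runs a file
whose hash changed while its mtime did not is reported as an error; new,
updated, renamed and missing files are tracked and reported as well.
"""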
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import atexit
import datetime
import errno
import hashlib
import os
import shutil
import sqlite3
import stat
import sys
import tempfile
import time
import unicodedata
DEFAULT_CHUNK_SIZE = 16384 # block size in HFS+; 4X the block size in ext4
DOT_THRESHOLD = 200
VERSION = (0, 9, 2)
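# Errors that make a single file be skipped (with a warning) rather than
# aborting the whole scan, e.g. the file disappeared or became unreadable
# between listing and hashing.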
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
FSENCODING = sys.getfilesystemencoding()
if sys.version[0] == '2':
str = type(u'text')
# use `bytes` for bytestrings
def normalize_path(path):
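    """Return `path` NFKD-normalized when the file system encoding is UTF-8.

    Keeps database lookups consistent when the OS reports file names in a
    decomposed Unicode form (as HFS+ does).
    """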
if FSENCODING in ('utf-8', 'UTF-8'):
return unicodedata.normalize('NFKD', path)
else:
return path
def sha1(path, chunk_size):
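    """Return the SHA-1 hex digest of the file at `path`, read `chunk_size`
    bytes at a time."""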
digest = hashlib.sha1()
with open(path, 'rb') as f:
d = f.read(chunk_size)
while d:
digest.update(d)
d = f.read(chunk_size)
return digest.hexdigest()
def ts():
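    """Return the current UTC time as a string for storage in the database."""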
return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S%z')
def get_sqlite3_cursor(path, copy=False):
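    """Open the bitrot SQLite database at `path`, creating tables as needed,
    and return the connection.

    With `copy=True`, operate on a temporary copy that is removed at exit so
    the original database is never modified (used by test mode).
    """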
path = path.decode(FSENCODING)
if copy:
if not os.path.exists(path):
raise ValueError("error: bitrot database at {} does not exist."
"".format(path))
db_copy = tempfile.NamedTemporaryFile(prefix='bitrot_', suffix='.db',
delete=False)
with open(path, 'rb') as db_orig:
try:
shutil.copyfileobj(db_orig, db_copy)
finally:
db_copy.close()
path = db_copy.name
atexit.register(os.unlink, path)
conn = sqlite3.connect(path)
atexit.register(conn.close)
cur = conn.cursor()
tables = set(t for t, in cur.execute('SELECT name FROM sqlite_master'))
if 'bitrot' not in tables:
cur.execute('CREATE TABLE bitrot (path TEXT PRIMARY KEY, '
'mtime INTEGER, hash TEXT, timestamp TEXT)')
if 'bitrot_hash_idx' not in tables:
cur.execute('CREATE INDEX bitrot_hash_idx ON bitrot (hash)')
atexit.register(conn.commit)
return conn
def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
"""list_existing_paths('/dir') -> ([path1, path2, ...], total_size)
Returns a tuple with a list with existing files in `directory` and their
`total_size`.
Doesn't add entries listed in `ignored`. Doesn't add symlinks if
`follow_links` is False (the default). All entries present in `expected`
must be files (can't be directories or symlinks).
"""
paths = set()
paths_decoded_and_normalized = set()
total_size = 0
for path, _, files in os.walk(directory):
for f in files:
p = os.path.join(path, f)
try:
p_uni = p.decode(FSENCODING)
except UnicodeDecodeError:
binary_stderr = getattr(sys.stderr, 'buffer', sys.stderr)
binary_stderr.write(b"warning: cannot decode file name: ")
binary_stderr.write(p)
binary_stderr.write(b"\n")
continue
try:
if follow_links or normalize_path(p_uni) in expected:
st = os.stat(p)
else:
st = os.lstat(p)
except OSError as ex:
if ex.errno not in IGNORED_FILE_SYSTEM_ERRORS:
raise
else:
if not stat.S_ISREG(st.st_mode) or p in ignored:
continue
paths.add(p)
paths_decoded_and_normalized.add(normalize_path(p.decode(FSENCODING)))
total_size += st.st_size
return paths, total_size, paths_decoded_and_normalized
class BitrotException(Exception):
pass
class Bitrot(object):
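    """Walk the current directory, compare each file against `.bitrot.db`,
    and report new, updated, renamed, and missing entries as well as files
    whose contents changed without a matching mtime change (bit rot)."""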
def __init__(
self, verbosity=1, test=False, follow_links=False, commit_interval=300,
chunk_size=DEFAULT_CHUNK_SIZE,
):
self.verbosity = verbosity
self.test = test
self.follow_links = follow_links
self.commit_interval = commit_interval
self.chunk_size = chunk_size
self._last_reported_size = ''
self._last_commit_ts = 0
def maybe_commit(self, conn):
if time.time() < self._last_commit_ts + self.commit_interval:
# no time for commit yet!
return
conn.commit()
self._last_commit_ts = time.time()
def run(self):
check_sha512_integrity(verbosity=self.verbosity)
bitrot_db = get_path()
bitrot_sha512 = get_path(ext=b'sha512')
try:
conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
except ValueError:
raise BitrotException(
2,
'No database exists so cannot test. Run the tool once first.',
)
cur = conn.cursor()
new_paths = []
updated_paths = []
renamed_paths = []
errors = []
current_size = 0
missing_paths = self.select_all_paths(cur)
hashes = self.select_all_hashes(cur)
paths, total_size, paths_decoded_and_normalized = list_existing_paths(
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)
for p in sorted(paths):
p_uni = p.decode(FSENCODING)
try:
st = os.stat(p)
except OSError as ex:
if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
# The file disappeared between listing existing paths and
# this run or is (temporarily?) locked with different
# permissions. We'll just skip it for now.
print(
'\rwarning: `{}` is currently unavailable for '
'reading: {}'.format(
p_uni, ex,
),
file=sys.stderr,
)
continue
raise # Not expected? https://github.com/ambv/bitrot/issues/
new_mtime = int(st.st_mtime)
current_size += st.st_size
if self.verbosity:
self.report_progress(current_size, total_size)
missing_paths.discard(normalize_path(p_uni))
try:
new_sha1 = sha1(p, self.chunk_size)
except (IOError, OSError) as e:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
continue
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (normalize_path(p_uni),))
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1, paths_decoded_and_normalized, hashes
)
self.maybe_commit(conn)
if p_uni == stored_path:
new_paths.append(p) # FIXME: shouldn't that be p_uni?
else:
renamed_paths.append((stored_path, p_uni))
missing_paths.discard(normalize_path(stored_path))
continue
stored_mtime, stored_sha1, stored_ts = row
if int(stored_mtime) != new_mtime:
updated_paths.append(p)
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
'WHERE path=?',
(new_mtime, new_sha1, ts(), normalize_path(p_uni)))
self.maybe_commit(conn)
continue
if stored_sha1 != new_sha1:
errors.append(p)
print(
'\rerror: SHA1 mismatch for {}: expected {}, got {}.'
' Last good hash checked on {}.'.format(
p.decode(FSENCODING), stored_sha1, new_sha1, stored_ts
),
file=sys.stderr,
)
        for path in missing_paths:
            # Entries in `missing_paths` are expected to be normalized
            # already, but normalize again here to be safe.
            cur.execute('DELETE FROM bitrot WHERE path=?',
                        (normalize_path(path),))
conn.commit()
if not self.test:
cur.execute('vacuum')
if self.verbosity:
cur.execute('SELECT COUNT(path) FROM bitrot')
all_count = cur.fetchone()[0]
self.report_done(
total_size,
all_count,
len(errors),
new_paths,
updated_paths,
renamed_paths,
missing_paths,
)
update_sha512_integrity(verbosity=self.verbosity)
if errors:
raise BitrotException(
1, 'There were {} errors found.'.format(len(errors)), errors,
)
def select_all_paths(self, cur):
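        """Return a set of all paths currently stored in the database."""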
result = set()
cur.execute('SELECT path FROM bitrot')
row = cur.fetchone()
while row:
result.add(row[0])
row = cur.fetchone()
return result
def select_all_hashes(self, cur):
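        """Return a dict mapping each stored hash to the set of paths using it."""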
result = {}
cur.execute('SELECT hash, path FROM bitrot')
row = cur.fetchone()
while row:
rhash, rpath = row
result.setdefault(rhash, set()).add(rpath)
row = cur.fetchone()
return result
def report_progress(self, current_size, total_size):
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
if size_fmt == self._last_reported_size:
return
sys.stdout.write(size_fmt)
sys.stdout.flush()
self._last_reported_size = size_fmt
def report_done(
self, total_size, all_count, error_count, new_paths, updated_paths,
renamed_paths, missing_paths):
print('\rFinished. {:.2f} MiB of data read. {} errors found.'
''.format(total_size/1024/1024, error_count))
if self.verbosity == 1:
print(
'{} entries in the database, {} new, {} updated, '
'{} renamed, {} missing.'.format(
all_count, len(new_paths), len(updated_paths),
len(renamed_paths), len(missing_paths),
),
)
elif self.verbosity > 1:
print('{} entries in the database.'.format(all_count), end=' ')
if new_paths:
print('{} entries new:'.format(len(new_paths)))
new_paths.sort()
for path in new_paths:
print(' ', path.decode(FSENCODING))
if updated_paths:
print('{} entries updated:'.format(len(updated_paths)))
updated_paths.sort()
for path in updated_paths:
print(' ', path.decode(FSENCODING))
if renamed_paths:
print('{} entries renamed:'.format(len(renamed_paths)))
renamed_paths.sort()
for path in renamed_paths:
print(
' from',
path[0],
'to',
path[1],
)
if missing_paths:
print('{} entries missing:'.format(len(missing_paths)))
missing_paths = sorted(missing_paths)
for path in missing_paths:
print(' ', path)
if not any((new_paths, updated_paths, missing_paths)):
print()
if self.test and self.verbosity:
print('warning: database file not updated on disk (test mode).')
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_decoded_and_normalized, hashes):
"""Either add a new entry to the database or update the existing entry
on rename.
        Returns `new_path` if the entry was indeed new, or the stored (now
        outdated) path if there was a rename.
"""
        try:
            # A stored hash whose path no longer exists on disk indicates a
            # rename rather than a new file.
            found = [path for path in hashes[new_sha1]
                     if path not in paths_decoded_and_normalized]
renamed = found.pop()
# update the path in the database
cur.execute(
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
(new_mtime, normalize_path(new_path), ts(), normalize_path(renamed)),
)
return renamed
        # KeyError comes from hashes[new_sha1], IndexError from found.pop().
        except (KeyError, IndexError):
cur.execute(
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
(normalize_path(new_path), new_mtime, new_sha1, ts()),
)
return new_path
2013-01-17 15:01:22 +01:00
def get_path(directory=b'.', ext=b'db'):
"""Compose the path to the selected bitrot file."""
return os.path.join(directory, b'.bitrot.' + ext)
def stable_sum(bitrot_db=None):
"""Calculates a stable SHA512 of all entries in the database.
Useful for comparing if two directories hold the same data, as it ignores
timing information."""
if bitrot_db is None:
bitrot_db = get_path()
digest = hashlib.sha512()
conn = get_sqlite3_cursor(bitrot_db)
cur = conn.cursor()
cur.execute('SELECT hash FROM bitrot ORDER BY path')
row = cur.fetchone()
while row:
digest.update(row[0].encode('ascii'))
row = cur.fetchone()
return digest.hexdigest()
def check_sha512_integrity(verbosity=1):
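    """Verify that `.bitrot.db` matches the SHA-512 recorded in
    `.bitrot.sha512`, raising BitrotException if it does not."""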
sha512_path = get_path(ext=b'sha512')
if not os.path.exists(sha512_path):
return
if verbosity:
print('Checking bitrot.db integrity... ', end='')
sys.stdout.flush()
with open(sha512_path, 'rb') as f:
old_sha512 = f.read().strip()
bitrot_db = get_path()
digest = hashlib.sha512()
with open(bitrot_db, 'rb') as f:
digest.update(f.read())
new_sha512 = digest.hexdigest().encode('ascii')
if new_sha512 != old_sha512:
if verbosity:
if len(old_sha512) == 128:
print(
"error: SHA512 of the file is different, bitrot.db might "
"be corrupt.",
)
else:
print(
"error: SHA512 of the file is different but bitrot.sha512 "
"has a suspicious length. It might be corrupt.",
)
print(
"If you'd like to continue anyway, delete the .bitrot.sha512 "
"file and try again.",
file=sys.stderr,
)
raise BitrotException(
3, 'bitrot.db integrity check failed, cannot continue.',
)
if verbosity:
print('ok.')
def update_sha512_integrity(verbosity=1):
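    """Recompute the SHA-512 of `.bitrot.db` and rewrite `.bitrot.sha512`
    if it changed."""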
old_sha512 = 0
sha512_path = get_path(ext=b'sha512')
if os.path.exists(sha512_path):
with open(sha512_path, 'rb') as f:
old_sha512 = f.read().strip()
bitrot_db = get_path()
digest = hashlib.sha512()
with open(bitrot_db, 'rb') as f:
digest.update(f.read())
new_sha512 = digest.hexdigest().encode('ascii')
if new_sha512 != old_sha512:
if verbosity:
print('Updating bitrot.sha512... ', end='')
sys.stdout.flush()
with open(sha512_path, 'wb') as f:
f.write(new_sha512)
if verbosity:
print('done.')
def run_from_command_line():
global FSENCODING
parser = argparse.ArgumentParser(prog='bitrot')
parser.add_argument(
'-l', '--follow-links', action='store_true',
help='follow symbolic links and store target files\' hashes. Once '
'a path is present in the database, it will be checked against '
'changes in content even if it becomes a symbolic link. In '
'other words, if you run `bitrot -l`, on subsequent runs '
'symbolic links registered during the first run will be '
'properly followed and checked even if you run without `-l`.')
parser.add_argument(
'-q', '--quiet', action='store_true',
help='don\'t print anything besides checksum errors')
parser.add_argument(
'-s', '--sum', action='store_true',
help='using only the data already gathered, return a SHA-512 sum '
'of hashes of all the entries in the database. No timestamps '
'are used in calculation.')
parser.add_argument(
'-v', '--verbose', action='store_true',
help='list new, updated and missing entries')
parser.add_argument(
'-t', '--test', action='store_true',
help='just test against an existing database, don\'t update anything')
parser.add_argument(
'--version', action='version',
version='%(prog)s {}.{}.{}'.format(*VERSION))
parser.add_argument(
'--commit-interval', type=float, default=300,
help='min time in seconds between commits '
'(0 commits on every operation)')
parser.add_argument(
'--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time')
parser.add_argument(
'--fsencoding', default='',
help='override the codec to decode filenames, otherwise taken from '
             'the LANG environment variable')
args = parser.parse_args()
if args.sum:
try:
print(stable_sum())
except RuntimeError as e:
print(str(e).encode('utf8'), file=sys.stderr)
else:
verbosity = 1
if args.quiet:
verbosity = 0
elif args.verbose:
verbosity = 2
bt = Bitrot(
verbosity=verbosity,
test=args.test,
follow_links=args.follow_links,
commit_interval=args.commit_interval,
chunk_size=args.chunk_size,
)
if args.fsencoding:
FSENCODING = args.fsencoding
try:
bt.run()
except BitrotException as bre:
print('error:', bre.args[1], file=sys.stderr)
sys.exit(bre.args[0])
if __name__ == '__main__':
run_from_command_line()
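
# Typical invocations handled by run_from_command_line() above, assuming the
# script is available on PATH as `bitrot` (the prog name set in argparse):
#
#   bitrot          scan the current directory and update .bitrot.db
#   bitrot -v       additionally list new, updated and missing entries
#   bitrot -q       print nothing besides checksum errors
#   bitrot -t       test against the existing database without writing to it
#   bitrot -s       print a stable SHA-512 over all hashes in the database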