2013-01-17 15:01:22 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# Copyright (C) 2013 by Łukasz Langa
|
2013-11-11 00:38:05 -08:00
|
|
|
|
2013-01-17 15:01:22 +01:00
|
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
# of this software and associated documentation files (the "Software"), to deal
|
|
|
|
# in the Software without restriction, including without limitation the rights
|
|
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
# copies of the Software, and to permit persons to whom the Software is
|
|
|
|
# furnished to do so, subject to the following conditions:
|
|
|
|
|
|
|
|
# The above copyright notice and this permission notice shall be included in
|
|
|
|
# all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
# THE SOFTWARE.
|
|
|
|
|
|
|
|
from __future__ import absolute_import
|
|
|
|
from __future__ import division
|
|
|
|
from __future__ import print_function
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2013-02-10 19:16:10 +01:00
|
|
|
import argparse
|
2013-01-17 15:01:22 +01:00
|
|
|
import atexit
|
|
|
|
import datetime
|
2013-10-17 11:42:23 -07:00
|
|
|
import errno
|
2013-01-17 15:01:22 +01:00
|
|
|
import hashlib
|
|
|
|
import os
|
2013-03-15 17:12:04 +01:00
|
|
|
import shutil
|
2013-01-17 15:59:29 +01:00
|
|
|
import sqlite3
|
2013-03-04 00:49:42 +01:00
|
|
|
import stat
|
2013-01-17 15:01:22 +01:00
|
|
|
import sys
|
2013-03-15 17:12:04 +01:00
|
|
|
import tempfile
|
2013-08-29 15:39:33 -07:00
|
|
|
import time
|
2013-01-17 15:01:22 +01:00
|
|
|
|
|
|
|
|
2016-11-01 12:02:34 -07:00
|
|
|
# Default number of bytes read per chunk when hashing files.
DEFAULT_CHUNK_SIZE = 16384  # block size in HFS+; 4X the block size in ext4
# NOTE(review): not referenced anywhere in this file; possibly legacy — verify.
DOT_THRESHOLD = 200
VERSION = (0, 9, 2)
# stat()/open() failures with these errnos are treated as "file vanished or
# unreadable" and skipped rather than raised.
IGNORED_FILE_SYSTEM_ERRORS = {errno.ENOENT, errno.EACCES}
# Codec used to decode byte paths for storage; run_from_command_line() may
# override this via --fsencoding.
FSENCODING = sys.getfilesystemencoding()
|
2013-01-17 15:01:22 +01:00
|
|
|
|
2013-01-17 15:59:29 +01:00
|
|
|
|
2016-08-09 14:51:57 -07:00
|
|
|
# Python 2 compatibility: rebind `str` so it always means "text" (unicode)
# in this module; use `bytes` for bytestrings.  Checking sys.version_info
# instead of slicing the sys.version string is the documented, robust way
# to detect the major version.
if sys.version_info[0] == 2:
    str = type(u'text')
    # use `bytes` for bytestrings
|
|
|
|
|
|
|
|
|
2013-10-17 11:40:01 -07:00
|
|
|
def sha1(path, chunk_size):
    """Return the hex SHA1 digest of the file at `path`.

    The file is read `chunk_size` bytes at a time so arbitrarily large
    files are hashed in constant memory.
    """
    hasher = hashlib.sha1()
    with open(path, 'rb') as stream:
        for block in iter(lambda: stream.read(chunk_size), b''):
            hasher.update(block)
    return hasher.hexdigest()
|
|
|
|
|
2013-10-27 06:45:25 +01:00
|
|
|
|
2015-06-22 18:08:26 -07:00
|
|
|
def ts():
    """Return the current UTC time formatted for database storage."""
    now = datetime.datetime.utcnow()
    return now.strftime('%Y-%m-%d %H:%M:%S%z')
|
2013-01-17 15:01:22 +01:00
|
|
|
|
2013-10-27 06:45:25 +01:00
|
|
|
|
2013-03-15 17:12:04 +01:00
|
|
|
def get_sqlite3_cursor(path, copy=False):
    """Return an sqlite3 connection to the bitrot database at `path` (bytes).

    When `copy` is True, the database is first duplicated into a temporary
    file and the connection is made to that copy, so the original is never
    modified (test mode); the copy is unlinked at interpreter exit.  Raises
    ValueError if `copy` is requested but no database exists at `path`.

    Creates the `bitrot` table and its hash index on first use.  A final
    commit and the connection close are both registered with `atexit`;
    handlers run LIFO, so the commit (registered last) runs before close.
    """
    # Paths are passed around as bytes; sqlite3 wants text.
    path = path.decode(FSENCODING)
    if copy:
        if not os.path.exists(path):
            raise ValueError("error: bitrot database at {} does not exist."
                             "".format(path))
        # delete=False: the copy must survive this function; cleanup is
        # delegated to atexit below.
        db_copy = tempfile.NamedTemporaryFile(prefix='bitrot_', suffix='.db',
                                              delete=False)
        with open(path, 'rb') as db_orig:
            try:
                shutil.copyfileobj(db_orig, db_copy)
            finally:
                db_copy.close()
        path = db_copy.name
        atexit.register(os.unlink, path)
    conn = sqlite3.connect(path)
    atexit.register(conn.close)
    cur = conn.cursor()
    tables = set(t for t, in cur.execute('SELECT name FROM sqlite_master'))
    if 'bitrot' not in tables:
        cur.execute('CREATE TABLE bitrot (path TEXT PRIMARY KEY, '
                    'mtime INTEGER, hash TEXT, timestamp TEXT)')
    if 'bitrot_hash_idx' not in tables:
        cur.execute('CREATE INDEX bitrot_hash_idx ON bitrot (hash)')
    # Registered after conn.close so it executes before it (atexit is LIFO).
    atexit.register(conn.commit)
    return conn
|
|
|
|
|
|
|
|
|
2015-06-22 18:08:26 -07:00
|
|
|
def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
    """list_existing_paths('/dir') -> ([path1, path2, ...], total_size)

    Returns a tuple with a list with existing files in `directory` and their
    `total_size`.

    Doesn't add entries listed in `ignored`. Doesn't add symlinks if
    `follow_links` is False (the default). All entries present in `expected`
    must be files (can't be directories or symlinks).
    """
    collected = set()
    bytes_seen = 0
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            try:
                full_path_uni = full_path.decode(FSENCODING)
            except UnicodeDecodeError:
                # The raw bytes can't be shown via print(); write them to the
                # binary layer of stderr instead.
                out = getattr(sys.stderr, 'buffer', sys.stderr)
                out.write(b"warning: cannot decode file name: ")
                out.write(full_path)
                out.write(b"\n")
                continue

            # Paths already in the database are followed even without -l so
            # they keep being checked after becoming symlinks.
            stat_fn = (
                os.stat if follow_links or full_path_uni in expected
                else os.lstat
            )
            try:
                st = stat_fn(full_path)
            except OSError as ex:
                if ex.errno not in IGNORED_FILE_SYSTEM_ERRORS:
                    raise
            else:
                if not stat.S_ISREG(st.st_mode) or full_path in ignored:
                    continue
                collected.add(full_path)
                bytes_seen += st.st_size
    return collected, bytes_seen
|
|
|
|
|
|
|
|
|
|
|
|
class BitrotException(Exception):
    """Fatal bitrot error.

    By convention args[0] is a numeric exit code and args[1] a
    human-readable message; an optional args[2] carries the list of
    offending paths (see Bitrot.run and run_from_command_line).
    """
    pass
|
|
|
|
|
|
|
|
|
|
|
|
class Bitrot(object):
    """Scans the current working directory tree and tracks file hashes.

    State is kept in an SQLite database (.bitrot.db) which is itself
    protected by a SHA512 checksum file (.bitrot.sha512).
    """

    def __init__(
        self, verbosity=1, test=False, follow_links=False, commit_interval=300,
        chunk_size=DEFAULT_CHUNK_SIZE,
    ):
        # verbosity: 0 = quiet, 1 = summary, >1 = per-path details.
        self.verbosity = verbosity
        # test: when True, work on a throwaway copy of the database.
        self.test = test
        self.follow_links = follow_links
        # Minimum seconds between database commits (0 commits every time).
        self.commit_interval = commit_interval
        self.chunk_size = chunk_size
        self._last_reported_size = ''
        self._last_commit_ts = 0

    def maybe_commit(self, conn):
        """Commit `conn` only if `commit_interval` seconds elapsed since
        the last commit; otherwise do nothing."""
        if time.time() < self._last_commit_ts + self.commit_interval:
            # no time for commit yet!
            return

        conn.commit()
        self._last_commit_ts = time.time()

    def run(self):
        """Scan the tree under the current directory and update the database.

        Reports new, updated, renamed and missing entries (depending on
        verbosity) and raises BitrotException(1, ...) if any stored SHA1
        no longer matches the file content.
        """
        check_sha512_integrity(verbosity=self.verbosity)

        bitrot_db = get_path()
        bitrot_sha512 = get_path(ext=b'sha512')
        try:
            conn = get_sqlite3_cursor(bitrot_db, copy=self.test)
        except ValueError:
            raise BitrotException(
                2,
                'No database exists so cannot test. Run the tool once first.',
            )

        cur = conn.cursor()
        new_paths = []
        updated_paths = []
        renamed_paths = []
        errors = []
        current_size = 0
        # Starts as "everything in the DB"; paths found on disk are
        # discarded as we go, leaving the truly missing ones.
        missing_paths = self.select_all_paths(cur)
        hashes = self.select_all_hashes(cur)
        paths, total_size = list_existing_paths(
            b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
            follow_links=self.follow_links,
        )

        for p in sorted(paths):
            p_uni = p.decode(FSENCODING)
            try:
                st = os.stat(p)
            except OSError as ex:
                if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
                    # The file disappeared between listing existing paths and
                    # this run or is (temporarily?) locked with different
                    # permissions. We'll just skip it for now.
                    print(
                        '\rwarning: `{}` is currently unavailable for '
                        'reading: {}'.format(
                            p_uni, ex,
                        ),
                        file=sys.stderr,
                    )
                    continue

                raise   # Not expected? https://github.com/ambv/bitrot/issues/

            new_mtime = int(st.st_mtime)
            current_size += st.st_size
            if self.verbosity:
                self.report_progress(current_size, total_size)

            missing_paths.discard(p_uni)
            try:
                new_sha1 = sha1(p, self.chunk_size)
            except (IOError, OSError) as e:
                print(
                    '\rwarning: cannot compute hash of {} [{}]'.format(
                        p, errno.errorcode[e.args[0]],
                    ),
                    file=sys.stderr,
                )
                continue

            cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
                        'path=?', (p_uni,))
            row = cur.fetchone()
            if not row:
                # Path unknown: either brand new or a rename of an entry
                # with the same hash.
                stored_path = self.handle_unknown_path(
                    cur, p_uni, new_mtime, new_sha1, paths, hashes
                )
                self.maybe_commit(conn)

                if p_uni == stored_path:
                    new_paths.append(p)  # FIXME: shouldn't that be p_uni?
                else:
                    renamed_paths.append((stored_path, p_uni))
                    missing_paths.discard(stored_path)
                continue

            stored_mtime, stored_sha1, stored_ts = row
            if int(stored_mtime) != new_mtime:
                # mtime changed: assume a legitimate edit and refresh the
                # stored hash rather than reporting bitrot.
                updated_paths.append(p)
                cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
                            'WHERE path=?',
                            (new_mtime, new_sha1, ts(), p_uni))
                self.maybe_commit(conn)
                continue

            if stored_sha1 != new_sha1:
                # Same mtime but different content: that's bitrot.
                errors.append(p)
                print(
                    '\rerror: SHA1 mismatch for {}: expected {}, got {}.'
                    ' Last good hash checked on {}.'.format(
                        p, stored_sha1, new_sha1, stored_ts
                    ),
                    file=sys.stderr,
                )

        for path in missing_paths:
            cur.execute('DELETE FROM bitrot WHERE path=?', (path,))

        conn.commit()

        if self.verbosity:
            cur.execute('SELECT COUNT(path) FROM bitrot')
            all_count = cur.fetchone()[0]
            self.report_done(
                total_size,
                all_count,
                len(errors),
                new_paths,
                updated_paths,
                renamed_paths,
                missing_paths,
            )

        update_sha512_integrity(verbosity=self.verbosity)

        if errors:
            raise BitrotException(
                1, 'There were {} errors found.'.format(len(errors)), errors,
            )

    def select_all_paths(self, cur):
        """Return a set of all paths currently stored in the database."""
        result = set()
        cur.execute('SELECT path FROM bitrot')
        row = cur.fetchone()
        while row:
            result.add(row[0])
            row = cur.fetchone()
        return result

    def select_all_hashes(self, cur):
        """Return a dict mapping each stored hash to the set of paths
        that share it (used for rename detection)."""
        result = {}
        cur.execute('SELECT hash, path FROM bitrot')
        row = cur.fetchone()
        while row:
            rhash, rpath = row
            result.setdefault(rhash, set()).add(rpath)
            row = cur.fetchone()
        return result

    def report_progress(self, current_size, total_size):
        """Write a '\\r'-prefixed percentage to stdout, skipping repeats."""
        size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
        if size_fmt == self._last_reported_size:
            return

        sys.stdout.write(size_fmt)
        sys.stdout.flush()
        self._last_reported_size = size_fmt

    def report_done(
        self, total_size, all_count, error_count, new_paths, updated_paths,
        renamed_paths, missing_paths):
        """Print the final summary; detail level depends on self.verbosity
        (1 = one-line counts, >1 = per-path listings)."""
        print('\rFinished. {:.2f} MiB of data read. {} errors found.'
              ''.format(total_size/1024/1024, error_count))
        if self.verbosity == 1:
            print(
                '{} entries in the database, {} new, {} updated, '
                '{} renamed, {} missing.'.format(
                    all_count, len(new_paths), len(updated_paths),
                    len(renamed_paths), len(missing_paths),
                ),
            )
        elif self.verbosity > 1:
            print('{} entries in the database.'.format(all_count), end=' ')
            if new_paths:
                print('{} entries new:'.format(len(new_paths)))
                new_paths.sort()
                for path in new_paths:
                    print(' ', path.decode(FSENCODING))
            if updated_paths:
                print('{} entries updated:'.format(len(updated_paths)))
                updated_paths.sort()
                for path in updated_paths:
                    print(' ', path.decode(FSENCODING))
            if renamed_paths:
                print('{} entries renamed:'.format(len(renamed_paths)))
                renamed_paths.sort()
                for path in renamed_paths:
                    print(
                        ' from',
                        path[0].decode(FSENCODING),
                        'to',
                        path[1].decode(FSENCODING),
                    )
            if missing_paths:
                print('{} entries missing:'.format(len(missing_paths)))
                missing_paths = sorted(missing_paths)
                for path in missing_paths:
                    print(' ', path)
            if not any((new_paths, updated_paths, missing_paths)):
                print()
        if self.test and self.verbosity:
            print('warning: database file not updated on disk (test mode).')

    def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes):
        """Either add a new entry to the database or update the existing entry
        on rename.

        Returns `new_path` if the entry was indeed new or the `stored_path` (e.g.
        outdated path) if there was a rename.
        """

        try:  # if the path isn't in the database
            # NOTE(review): `paths` holds bytes (from list_existing_paths)
            # while DB paths are str, so on Python 3 this membership test
            # may never match — verify rename detection against callers.
            found = [path for path in hashes[new_sha1] if path not in paths]
            renamed = found.pop()
            # update the path in the database
            cur.execute(
                'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
                (new_mtime, new_path, ts(), renamed),
            )

            return renamed

        # From hashes[new_sha1] or found.pop()
        except (KeyError,IndexError):
            cur.execute(
                'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
                (new_path, new_mtime, new_sha1, ts()),
            )
            return new_path
|
2013-01-17 15:01:22 +01:00
|
|
|
|
2016-05-02 17:49:25 -07:00
|
|
|
def get_path(directory=b'.', ext=b'db'):
    """Compose the path to the selected bitrot file (as bytes)."""
    filename = b'.bitrot.' + ext
    return os.path.join(directory, filename)
|
|
|
|
|
|
|
|
|
2016-10-29 19:27:18 -07:00
|
|
|
def stable_sum(bitrot_db=None):
    """Calculates a stable SHA512 of all entries in the database.

    Useful for comparing if two directories hold the same data, as it ignores
    timing information."""
    if bitrot_db is None:
        bitrot_db = get_path()
    digest = hashlib.sha512()
    conn = get_sqlite3_cursor(bitrot_db)
    cur = conn.cursor()
    # Ordering by path makes the digest independent of scan order.
    cur.execute('SELECT hash FROM bitrot ORDER BY path')
    for entry in iter(cur.fetchone, None):
        digest.update(entry[0].encode('ascii'))
    return digest.hexdigest()
|
|
|
|
|
|
|
|
|
2016-08-09 14:51:57 -07:00
|
|
|
def check_sha512_integrity(verbosity=1):
    """Verify .bitrot.db against the checksum stored in .bitrot.sha512.

    A missing checksum file is accepted silently; a mismatch raises
    BitrotException with exit code 3.
    """
    sha512_path = get_path(ext=b'sha512')
    if not os.path.exists(sha512_path):
        return

    if verbosity:
        print('Checking bitrot.db integrity... ', end='')
        sys.stdout.flush()

    with open(sha512_path, 'rb') as stored_file:
        stored_sum = stored_file.read().strip()
    hasher = hashlib.sha512()
    with open(get_path(), 'rb') as db_file:
        hasher.update(db_file.read())
    computed_sum = hasher.hexdigest().encode('ascii')

    if computed_sum != stored_sum:
        if verbosity:
            # A valid hex SHA512 is exactly 128 characters; anything else
            # suggests the checksum file itself is damaged.
            if len(stored_sum) == 128:
                print(
                    "error: SHA512 of the file is different, bitrot.db might "
                    "be corrupt.",
                )
            else:
                print(
                    "error: SHA512 of the file is different but bitrot.sha512 "
                    "has a suspicious length. It might be corrupt.",
                )
            print(
                "If you'd like to continue anyway, delete the .bitrot.sha512 "
                "file and try again.",
                file=sys.stderr,
            )
        raise BitrotException(
            3, 'bitrot.db integrity check failed, cannot continue.',
        )

    if verbosity:
        print('ok.')
|
2016-05-02 17:49:25 -07:00
|
|
|
|
2016-08-09 14:51:57 -07:00
|
|
|
def update_sha512_integrity(verbosity=1):
    """Refresh .bitrot.sha512 with the current checksum of .bitrot.db.

    The file is rewritten only when the stored checksum differs from the
    freshly computed one (or doesn't exist yet).
    """
    stored_sum = 0
    sha512_path = get_path(ext=b'sha512')
    if os.path.exists(sha512_path):
        with open(sha512_path, 'rb') as stored_file:
            stored_sum = stored_file.read().strip()
    hasher = hashlib.sha512()
    with open(get_path(), 'rb') as db_file:
        hasher.update(db_file.read())
    computed_sum = hasher.hexdigest().encode('ascii')
    if computed_sum != stored_sum:
        if verbosity:
            print('Updating bitrot.sha512... ', end='')
            sys.stdout.flush()
        with open(sha512_path, 'wb') as out:
            out.write(computed_sum)
        if verbosity:
            print('done.')
|
2016-05-02 17:49:25 -07:00
|
|
|
|
2013-02-10 19:16:10 +01:00
|
|
|
def run_from_command_line():
    """Parse command-line arguments and run the tool.

    With --sum, prints a stable SHA512 over the existing database and
    returns; otherwise builds a Bitrot instance and runs a full scan,
    exiting with the BitrotException's numeric code on failure.
    """
    global FSENCODING

    parser = argparse.ArgumentParser(prog='bitrot')
    parser.add_argument(
        '-l', '--follow-links', action='store_true',
        help='follow symbolic links and store target files\' hashes. Once '
             'a path is present in the database, it will be checked against '
             'changes in content even if it becomes a symbolic link. In '
             'other words, if you run `bitrot -l`, on subsequent runs '
             'symbolic links registered during the first run will be '
             'properly followed and checked even if you run without `-l`.')
    parser.add_argument(
        '-q', '--quiet', action='store_true',
        help='don\'t print anything besides checksum errors')
    parser.add_argument(
        '-s', '--sum', action='store_true',
        help='using only the data already gathered, return a SHA-512 sum '
             'of hashes of all the entries in the database. No timestamps '
             'are used in calculation.')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='list new, updated and missing entries')
    parser.add_argument(
        '-t', '--test', action='store_true',
        help='just test against an existing database, don\'t update anything')
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s {}.{}.{}'.format(*VERSION))
    parser.add_argument(
        '--commit-interval', type=float, default=300,
        help='min time in seconds between commits '
             '(0 commits on every operation)')
    parser.add_argument(
        '--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
        help='read files this many bytes at a time')
    parser.add_argument(
        '--fsencoding', default='',
        help='override the codec to decode filenames, otherwise taken from '
             'the LANG environment variables')
    args = parser.parse_args()
    if args.sum:
        try:
            print(stable_sum())
        except RuntimeError as e:
            print(str(e).encode('utf8'), file=sys.stderr)
    else:
        # --quiet wins over --verbose when both are given.
        verbosity = 1
        if args.quiet:
            verbosity = 0
        elif args.verbose:
            verbosity = 2
        bt = Bitrot(
            verbosity=verbosity,
            test=args.test,
            follow_links=args.follow_links,
            commit_interval=args.commit_interval,
            chunk_size=args.chunk_size,
        )
        # Must be set before bt.run() so all path decoding uses the override.
        if args.fsencoding:
            FSENCODING = args.fsencoding
        try:
            bt.run()
        except BitrotException as bre:
            print('error:', bre.args[1], file=sys.stderr)
            sys.exit(bre.args[0])
|
2013-02-10 19:16:10 +01:00
|
|
|
|
|
|
|
|
2013-01-17 15:01:22 +01:00
|
|
|
# Script entry point.
if __name__ == '__main__':
    run_from_command_line()
|