Add --follow-links, skip files with EACCES et al.

This commit is contained in:
Łukasz Langa 2013-11-11 00:38:05 -08:00
parent 1f94944f87
commit 1b8a582e34
2 changed files with 63 additions and 31 deletions

View File

@ -2,8 +2,8 @@
bitrot bitrot
====== ======
Detects bit rotten files on the hard drive to save your precious photo and Detects bit rotten files on the hard drive to save your precious photo
music collection from slow decay. and music collection from slow decay.
Usage Usage
----- -----
@ -12,26 +12,26 @@ Go to the desired directory and simply invoke::
$ bitrot $ bitrot
This will start digging through your directory structure recursively indexing This will start digging through your directory structure recursively
all files found. The index is stored in a ``.bitrot.db`` file which is a SQLite indexing all files found. The index is stored in a ``.bitrot.db`` file
3 database. which is a SQLite 3 database.
Next time you run ``bitrot`` it will add new files and update the index for Next time you run ``bitrot`` it will add new files and update the index
files with a changed modification date. Most importantly however, it will for files with a changed modification date. Most importantly however, it
report all errors, e.g. files that changed on the hard drive but still have the will report all errors, e.g. files that changed on the hard drive but
same modification date. still have the same modification date.
All paths stored in ``.bitrot.db`` are relative so it's safe to rescan a folder All paths stored in ``.bitrot.db`` are relative so it's safe to rescan
after moving it to another drive. a folder after moving it to another drive.
Performance Performance
----------- -----------
Obviously depends on how fast the underlying drive is. No rigorous performance Obviously depends on how fast the underlying drive is. No rigorous
tests have been done. For informational purposes, on my typical 5400 RPM laptop performance tests have been done. For informational purposes, a typical
hard drive scanning a 60+ GB music library takes around 15 minutes. On an OCZ 5400 RPM laptop hard drive scanning a 60+ GB music library takes around
Vertex 3 SSD drive ``bitrot`` is able to scan a 100 GB Aperture library in 15 minutes. On an OCZ Vertex 3 SSD drive ``bitrot`` is able to scan
under 10 minutes. Both tests on HFS+. a 100 GB Aperture library in under 10 minutes. Both tests on HFS+.
Change Log Change Log
---------- ----------
@ -42,7 +42,10 @@ Change Log
* more control over performance with ``--commit-interval`` and * more control over performance with ``--commit-interval`` and
``--chunk-size`` command-line arguments ``--chunk-size`` command-line arguments
* bugfix: symbolic links are now properly skipped * bugfix: symbolic links are now properly skipped (or can be followed if
``--follow-links`` is passed)
* bugfix: files that cannot be opened are now gracefully skipped
* bugfix: fixed a rare division by zero when run in an empty directory * bugfix: fixed a rare division by zero when run in an empty directory
@ -54,8 +57,9 @@ Change Log
0.5.0 0.5.0
~~~~~ ~~~~~
* ``--test`` command-line argument for testing the state without updating the * ``--test`` command-line argument for testing the state without
database on disk (works for testing databases you don't have write access to) updating the database on disk (works for testing databases you don't
have write access to)
* size of the data read is reported upon finish * size of the data read is reported upon finish
@ -66,19 +70,22 @@ Change Log
* renames are now reported as such * renames are now reported as such
* all non-regular files (e.g. symbolic links, pipes, sockets) are now skipped * all non-regular files (e.g. symbolic links, pipes, sockets) are now
skipped
* progress presented in percentage * progress presented in percentage
0.3.0 0.3.0
~~~~~ ~~~~~
* ``--sum`` command-line argument for easy comparison of multiple databases * ``--sum`` command-line argument for easy comparison of multiple
databases
0.2.1 0.2.1
~~~~~ ~~~~~
* fixed regression from 0.2.0 where new files caused a ``KeyError`` exception * fixed regression from 0.2.0 where new files caused a ``KeyError``
exception
0.2.0 0.2.0
~~~~~ ~~~~~

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2013 by Łukasz Langa # Copyright (C) 2013 by Łukasz Langa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights # in the Software without restriction, including without limitation the rights
@ -90,7 +90,7 @@ def get_sqlite3_cursor(path, copy=False):
return conn return conn
def run(verbosity=1, test=False, commit_interval=300, def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
chunk_size=DEFAULT_CHUNK_SIZE): chunk_size=DEFAULT_CHUNK_SIZE):
current_dir = b'.' # sic, relative path current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db') bitrot_db = os.path.join(current_dir, b'.bitrot.db')
@ -113,8 +113,12 @@ def run(verbosity=1, test=False, commit_interval=300,
for path, _, files in os.walk(current_dir): for path, _, files in os.walk(current_dir):
for f in files: for f in files:
p = os.path.join(path, f) p = os.path.join(path, f)
p_uni = p.decode('utf8')
try: try:
st = os.lstat(p) if follow_links or p_uni in missing_paths:
st = os.stat(p)
else:
st = os.lstat(p)
except OSError as ex: except OSError as ex:
if ex.errno != errno.ENOENT: if ex.errno != errno.ENOENT:
raise raise
@ -136,12 +140,22 @@ def run(verbosity=1, test=False, commit_interval=300,
sys.stdout.write(size_fmt) sys.stdout.write(size_fmt)
sys.stdout.flush() sys.stdout.flush()
last_reported_size = size_fmt last_reported_size = size_fmt
new_sha1 = sha1(p, chunk_size) p_uni = p.decode('utf8')
missing_paths.discard(p_uni)
try:
new_sha1 = sha1(p, chunk_size)
except (IOError, OSError) as e:
if verbosity:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
continue
update_ts = datetime.datetime.utcnow().strftime( update_ts = datetime.datetime.utcnow().strftime(
'%Y-%m-%d %H:%M:%S%z' '%Y-%m-%d %H:%M:%S%z'
) )
p_uni = p.decode('utf8')
missing_paths.discard(p_uni)
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE ' cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (p_uni,)) 'path=?', (p_uni,))
row = cur.fetchone() row = cur.fetchone()
@ -241,6 +255,13 @@ def stable_sum():
def run_from_command_line(): def run_from_command_line():
parser = argparse.ArgumentParser(prog='bitrot') parser = argparse.ArgumentParser(prog='bitrot')
parser.add_argument('-l', '--follow-links', action='store_true',
help='follow symbolic links and store target files\' hashes. Once '
'a path is present in the database, it will be checked against '
'changes in content even if it becomes a symbolic link. In '
'other words, if you run `bitrot -l`, on subsequent runs '
'symbolic links registered during the first run will be '
'properly followed and checked even if you run without `-l`.')
parser.add_argument('-q', '--quiet', action='store_true', parser.add_argument('-q', '--quiet', action='store_true',
help='don\'t print anything besides checksum errors') help='don\'t print anything besides checksum errors')
parser.add_argument('-s', '--sum', action='store_true', parser.add_argument('-s', '--sum', action='store_true',
@ -254,7 +275,8 @@ def run_from_command_line():
parser.add_argument('--version', action='version', parser.add_argument('--version', action='version',
version='%(prog)s {}.{}.{}'.format(*VERSION)) version='%(prog)s {}.{}.{}'.format(*VERSION))
parser.add_argument('--commit-interval', type=float, default=300, parser.add_argument('--commit-interval', type=float, default=300,
help='min time between commits (0 commits on every operation)') help='min time in seconds between commits '
'(0 commits on every operation)')
parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE, parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time') help='read files this many bytes at a time')
args = parser.parse_args() args = parser.parse_args()
@ -269,9 +291,12 @@ def run_from_command_line():
verbosity = 0 verbosity = 0
elif args.verbose: elif args.verbose:
verbosity = 2 verbosity = 2
run(verbosity=verbosity, test=args.test, run(verbosity=verbosity,
test=args.test,
follow_links=args.follow_links,
commit_interval=args.commit_interval, commit_interval=args.commit_interval,
chunk_size=args.chunk_size) chunk_size=args.chunk_size,
)
if __name__ == '__main__': if __name__ == '__main__':