Add --follow-links, skip files with EACCES et al.

This commit is contained in:
Łukasz Langa 2013-11-11 00:38:05 -08:00
parent 1f94944f87
commit 1b8a582e34
2 changed files with 63 additions and 31 deletions

@ -2,8 +2,8 @@
bitrot
======
Detects bit rotten files on the hard drive to save your precious photo and
music collection from slow decay.
Detects bit rotten files on the hard drive to save your precious photo
and music collection from slow decay.
Usage
-----
@ -12,26 +12,26 @@ Go to the desired directory and simply invoke::
$ bitrot
This will start digging through your directory structure recursively indexing
all files found. The index is stored in a ``.bitrot.db`` file which is a SQLite
3 database.
This will start digging through your directory structure recursively
indexing all files found. The index is stored in a ``.bitrot.db`` file
which is a SQLite 3 database.
Next time you run ``bitrot`` it will add new files and update the index for
files with a changed modification date. Most importantly however, it will
report all errors, e.g. files that changed on the hard drive but still have the
same modification date.
Next time you run ``bitrot`` it will add new files and update the index
for files with a changed modification date. Most importantly, however, it
will report all errors, e.g. files that changed on the hard drive but
still have the same modification date.
All paths stored in ``.bitrot.db`` are relative so it's safe to rescan a folder
after moving it to another drive.
All paths stored in ``.bitrot.db`` are relative so it's safe to rescan
a folder after moving it to another drive.
Performance
-----------
Obviously depends on how fast the underlying drive is. No rigorous performance
tests have been done. For informational purposes, on my typical 5400 RPM laptop
hard drive scanning a 60+ GB music library takes around 15 minutes. On an OCZ
Vertex 3 SSD drive ``bitrot`` is able to scan a 100 GB Aperture library in
under 10 minutes. Both tests on HFS+.
Obviously depends on how fast the underlying drive is. No rigorous
performance tests have been done. For informational purposes, on a typical
5400 RPM laptop hard drive, scanning a 60+ GB music library takes around
15 minutes. On an OCZ Vertex 3 SSD, ``bitrot`` is able to scan
a 100 GB Aperture library in under 10 minutes. Both tests were run on HFS+.
Change Log
----------
@ -42,7 +42,10 @@ Change Log
* more control over performance with ``--commit-interval`` and
``--chunk-size`` command-line arguments
* bugfix: symbolic links are now properly skipped
* bugfix: symbolic links are now properly skipped (or can be followed if
``--follow-links`` is passed)
* bugfix: files that cannot be opened are now gracefully skipped
* bugfix: fixed a rare division by zero when run in an empty directory
@ -54,8 +57,9 @@ Change Log
0.5.0
~~~~~
* ``--test`` command-line argument for testing the state without updating the
database on disk (works for testing databases you don't have write access to)
* ``--test`` command-line argument for testing the state without
updating the database on disk (works for testing databases you don't
have write access to)
* size of the data read is reported upon finish
@ -66,19 +70,22 @@ Change Log
* renames are now reported as such
* all non-regular files (e.g. symbolic links, pipes, sockets) are now skipped
* all non-regular files (e.g. symbolic links, pipes, sockets) are now
skipped
* progress presented in percentage
0.3.0
~~~~~
* ``--sum`` command-line argument for easy comparison of multiple databases
* ``--sum`` command-line argument for easy comparison of multiple
databases
0.2.1
~~~~~
* fixed regression from 0.2.0 where new files caused a ``KeyError`` exception
* fixed regression from 0.2.0 where new files caused a ``KeyError``
exception
0.2.0
~~~~~

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2013 by Łukasz Langa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
@ -90,7 +90,7 @@ def get_sqlite3_cursor(path, copy=False):
return conn
def run(verbosity=1, test=False, commit_interval=300,
def run(verbosity=1, test=False, follow_links=False, commit_interval=300,
chunk_size=DEFAULT_CHUNK_SIZE):
current_dir = b'.' # sic, relative path
bitrot_db = os.path.join(current_dir, b'.bitrot.db')
@ -113,8 +113,12 @@ def run(verbosity=1, test=False, commit_interval=300,
for path, _, files in os.walk(current_dir):
for f in files:
p = os.path.join(path, f)
p_uni = p.decode('utf8')
try:
st = os.lstat(p)
if follow_links or p_uni in missing_paths:
st = os.stat(p)
else:
st = os.lstat(p)
except OSError as ex:
if ex.errno != errno.ENOENT:
raise
@ -136,12 +140,22 @@ def run(verbosity=1, test=False, commit_interval=300,
sys.stdout.write(size_fmt)
sys.stdout.flush()
last_reported_size = size_fmt
new_sha1 = sha1(p, chunk_size)
p_uni = p.decode('utf8')
missing_paths.discard(p_uni)
try:
new_sha1 = sha1(p, chunk_size)
except (IOError, OSError) as e:
if verbosity:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
continue
update_ts = datetime.datetime.utcnow().strftime(
'%Y-%m-%d %H:%M:%S%z'
)
p_uni = p.decode('utf8')
missing_paths.discard(p_uni)
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (p_uni,))
row = cur.fetchone()
@ -241,6 +255,13 @@ def stable_sum():
def run_from_command_line():
parser = argparse.ArgumentParser(prog='bitrot')
parser.add_argument('-l', '--follow-links', action='store_true',
help='follow symbolic links and store target files\' hashes. Once '
'a path is present in the database, it will be checked against '
'changes in content even if it becomes a symbolic link. In '
'other words, if you run `bitrot -l`, on subsequent runs '
'symbolic links registered during the first run will be '
'properly followed and checked even if you run without `-l`.')
parser.add_argument('-q', '--quiet', action='store_true',
help='don\'t print anything besides checksum errors')
parser.add_argument('-s', '--sum', action='store_true',
@ -254,7 +275,8 @@ def run_from_command_line():
parser.add_argument('--version', action='version',
version='%(prog)s {}.{}.{}'.format(*VERSION))
parser.add_argument('--commit-interval', type=float, default=300,
help='min time between commits (0 commits on every operation)')
help='min time in seconds between commits '
'(0 commits on every operation)')
parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time')
args = parser.parse_args()
@ -269,9 +291,12 @@ def run_from_command_line():
verbosity = 0
elif args.verbose:
verbosity = 2
run(verbosity=verbosity, test=args.test,
run(verbosity=verbosity,
test=args.test,
follow_links=args.follow_links,
commit_interval=args.commit_interval,
chunk_size=args.chunk_size)
chunk_size=args.chunk_size,
)
if __name__ == '__main__':