Use a process pool to calculate hashes and perform stat()

Fixes #23
This commit is contained in:
Łukasz Langa 2020-05-17 22:55:35 +02:00
parent 104e07b66b
commit 0dc3390b7f
No known key found for this signature in database
GPG Key ID: B26995E310250568
3 changed files with 62 additions and 38 deletions

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2013 by Łukasz Langa
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
@ -26,5 +26,11 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from multiprocessing import freeze_support
from bitrot import run_from_command_line
run_from_command_line()
if __name__ == "__main__":
freeze_support()
run_from_command_line()

View File

@ -58,8 +58,8 @@ setup(
include_package_data = True,
zip_safe = False, # if only because of the readme file
install_requires = [
'futures; python_version == "2.7"'
],
classifiers = [
'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License',

View File

@ -40,6 +40,8 @@ import tempfile
import time
import unicodedata
from concurrent.futures import ProcessPoolExecutor, wait, as_completed
DEFAULT_CHUNK_SIZE = 16384 # block size in HFS+; 4X the block size in ext4
DOT_THRESHOLD = 200
@ -144,6 +146,43 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
return paths, total_size
def compute_one(path, chunk_size):
"""Return a tuple with (unicode path, size, mtime, sha1). Takes a binary path."""
p_uni = normalize_path(path)
try:
st = os.stat(path)
except OSError as ex:
if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
# The file disappeared between listing existing paths and
# this run or is (temporarily?) locked with different
# permissions. We'll just skip it for now.
print(
'\rwarning: `{}` is currently unavailable for '
'reading: {}'.format(
p_uni, ex,
),
file=sys.stderr,
)
raise BitrotException
raise # Not expected? https://github.com/ambv/bitrot/issues/
new_mtime = int(st.st_mtime)
try:
new_sha1 = sha1(path, chunk_size)
except (IOError, OSError) as e:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p_uni, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
raise BitrotException
return p_uni, st.st_size, int(st.st_mtime), new_sha1
class BitrotException(Exception):
pass
@ -151,7 +190,7 @@ class BitrotException(Exception):
class Bitrot(object):
def __init__(
self, verbosity=1, test=False, follow_links=False, commit_interval=300,
chunk_size=DEFAULT_CHUNK_SIZE,
chunk_size=DEFAULT_CHUNK_SIZE, workers=os.cpu_count(),
):
self.verbosity = verbosity
self.test = test
@ -160,6 +199,7 @@ class Bitrot(object):
self.chunk_size = chunk_size
self._last_reported_size = ''
self._last_commit_ts = 0
self.pool = ProcessPoolExecutor(max_workers=workers)
def maybe_commit(self, conn):
if time.time() < self._last_commit_ts + self.commit_interval:
@ -195,44 +235,18 @@ class Bitrot(object):
follow_links=self.follow_links,
)
paths_uni = set(normalize_path(p) for p in paths)
futures = [self.pool.submit(compute_one, p, self.chunk_size) for p in paths]
for p in sorted(paths):
p_uni = normalize_path(p)
for future in as_completed(futures):
try:
st = os.stat(p)
except OSError as ex:
if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
# The file disappeared between listing existing paths and
# this run or is (temporarily?) locked with different
# permissions. We'll just skip it for now.
print(
'\rwarning: `{}` is currently unavailable for '
'reading: {}'.format(
p_uni, ex,
),
file=sys.stderr,
)
continue
p_uni, new_size, new_mtime, new_sha1 = future.result()
except BitrotException:
continue
raise # Not expected? https://github.com/ambv/bitrot/issues/
new_mtime = int(st.st_mtime)
current_size += st.st_size
current_size += new_size
if self.verbosity:
self.report_progress(current_size, total_size)
try:
new_sha1 = sha1(p, self.chunk_size)
except (IOError, OSError) as e:
print(
'\rwarning: cannot compute hash of {} [{}]'.format(
p, errno.errorcode[e.args[0]],
),
file=sys.stderr,
)
missing_paths.discard(p_uni)
continue
if p_uni not in missing_paths:
# We are not expecting this path, it wasn't in the database yet.
# It's either new or a rename. Let's handle that.
@ -271,11 +285,11 @@ class Bitrot(object):
continue
if stored_sha1 != new_sha1:
errors.append(p)
errors.append(p_uni)
print(
'\rerror: SHA1 mismatch for {}: expected {}, got {}.'
' Last good hash checked on {}.'.format(
p.decode(FSENCODING), stored_sha1, new_sha1, stored_ts
p_uni, stored_sha1, new_sha1, stored_ts
),
file=sys.stderr,
)
@ -538,6 +552,9 @@ def run_from_command_line():
'--commit-interval', type=float, default=300,
help='min time in seconds between commits '
'(0 commits on every operation)')
parser.add_argument(
'-w', '--workers', type=int, default=os.cpu_count(),
help='run this many workers (use -w1 for slow magnetic disks)')
parser.add_argument(
'--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
help='read files this many bytes at a time')
@ -563,6 +580,7 @@ def run_from_command_line():
follow_links=args.follow_links,
commit_interval=args.commit_interval,
chunk_size=args.chunk_size,
workers=args.workers,
)
if args.fsencoding:
FSENCODING = args.fsencoding