Simplify normalization and Unicode handling

This commit is contained in:
Łukasz Langa 2020-05-17 21:18:48 +02:00
parent 7608b56ea6
commit 8ee84344e8
No known key found for this signature in database
GPG Key ID: B26995E310250568

View File

@@ -54,10 +54,11 @@ if sys.version[0] == '2':
def normalize_path(path):
path_uni = path.decode(FSENCODING)
if FSENCODING in ('utf-8', 'UTF-8'):
return unicodedata.normalize('NFKD', path)
else:
return path
return unicodedata.normalize('NFKD', path_uni)
return path_uni
def sha1(path, chunk_size):
@@ -103,17 +104,17 @@ def get_sqlite3_cursor(path, copy=False):
def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
"""list_existing_paths('/dir') -> ([path1, path2, ...], total_size)
"""list_existing_paths(b'/dir') -> ([path1, path2, ...], total_size)
Returns a tuple with a list with existing files in `directory` and their
`total_size`.
Returns a tuple with a set of existing files in `directory` and its subdirectories
and their `total_size`. If directory was a bytes object, so will be the returned
paths.
Doesn't add entries listed in `ignored`. Doesn't add symlinks if
`follow_links` is False (the default). All entries present in `expected`
must be files (can't be directories or symlinks).
"""
paths = set()
paths_decoded_and_normalized = set()
total_size = 0
for path, _, files in os.walk(directory):
for f in files:
@@ -128,7 +129,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
continue
try:
if follow_links or normalize_path(p_uni) in expected:
if follow_links or p_uni in expected:
st = os.stat(p)
else:
st = os.lstat(p)
@@ -139,9 +140,8 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
if not stat.S_ISREG(st.st_mode) or p in ignored:
continue
paths.add(p)
paths_decoded_and_normalized.add(normalize_path(p.decode(FSENCODING)))
total_size += st.st_size
return paths, total_size, paths_decoded_and_normalized
return paths, total_size
class BitrotException(Exception):
@@ -190,13 +190,14 @@ class Bitrot(object):
current_size = 0
missing_paths = self.select_all_paths(cur)
hashes = self.select_all_hashes(cur)
paths, total_size, paths_decoded_and_normalized = list_existing_paths(
paths, total_size = list_existing_paths(
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)
paths_uni = set(normalize_path(p) for p in paths)
for p in sorted(paths):
p_uni = p.decode(FSENCODING)
p_uni = normalize_path(p)
try:
st = os.stat(p)
except OSError as ex:
@@ -220,7 +221,7 @@ class Bitrot(object):
if self.verbosity:
self.report_progress(current_size, total_size)
missing_paths.discard(normalize_path(p_uni))
missing_paths.discard(p_uni)
try:
new_sha1 = sha1(p, self.chunk_size)
except (IOError, OSError) as e:
@@ -232,28 +233,28 @@ class Bitrot(object):
)
continue
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (normalize_path(p_uni),))
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE path=?',
(p_uni,))
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1, paths_decoded_and_normalized, hashes
cur, p_uni, new_mtime, new_sha1, paths_uni, hashes
)
self.maybe_commit(conn)
if p_uni == stored_path:
new_paths.append(p) # FIXME: shouldn't that be p_uni?
new_paths.append(p_uni)
else:
renamed_paths.append((stored_path, p_uni))
missing_paths.discard(normalize_path(stored_path))
missing_paths.discard(stored_path)
continue
stored_mtime, stored_sha1, stored_ts = row
if int(stored_mtime) != new_mtime:
updated_paths.append(p)
updated_paths.append(p_uni)
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
'WHERE path=?',
(new_mtime, new_sha1, ts(), normalize_path(p_uni)))
(new_mtime, new_sha1, ts(), p_uni))
self.maybe_commit(conn)
continue
@@ -268,7 +269,7 @@ class Bitrot(object):
)
for path in missing_paths:
cur.execute('DELETE FROM bitrot WHERE path=?', (normalize_path(path),)) # it is expected that content of missing_paths is already normalized, but just to be sure
cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
conn.commit()
@@ -296,6 +297,10 @@ class Bitrot(object):
)
def select_all_paths(self, cur):
"""Return a set of all distinct paths in the bitrot database.
The paths are Unicode and are normalized if FSENCODING was UTF-8.
"""
result = set()
cur.execute('SELECT path FROM bitrot')
row = cur.fetchone()
@@ -305,6 +310,10 @@ class Bitrot(object):
return result
def select_all_hashes(self, cur):
"""Return a dict where keys are hashes and values are sets of paths.
The paths are Unicode and are normalized if FSENCODING was UTF-8.
"""
result = {}
cur.execute('SELECT hash, path FROM bitrot')
row = cur.fetchone()
@@ -326,6 +335,7 @@ class Bitrot(object):
def report_done(
self, total_size, all_count, error_count, new_paths, updated_paths,
renamed_paths, missing_paths):
"""Print a report on what happened. All paths should be Unicode here."""
print('\rFinished. {:.2f} MiB of data read. {} errors found.'
''.format(total_size/1024/1024, error_count))
if self.verbosity == 1:
@@ -342,12 +352,12 @@ class Bitrot(object):
print('{} entries new:'.format(len(new_paths)))
new_paths.sort()
for path in new_paths:
print(' ', path.decode(FSENCODING))
print(' ', path)
if updated_paths:
print('{} entries updated:'.format(len(updated_paths)))
updated_paths.sort()
for path in updated_paths:
print(' ', path.decode(FSENCODING))
print(' ', path)
if renamed_paths:
print('{} entries renamed:'.format(len(renamed_paths)))
renamed_paths.sort()
@@ -368,7 +378,7 @@ class Bitrot(object):
if self.test and self.verbosity:
print('warning: database file not updated on disk (test mode).')
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_decoded_and_normalized, hashes):
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_uni, hashes):
"""Either add a new entry to the database or update the existing entry
on rename.
@@ -377,12 +387,12 @@ class Bitrot(object):
"""
try: # if the path isn't in the database
found = [path for path in hashes[new_sha1] if path not in paths_decoded_and_normalized]
found = [path for path in hashes[new_sha1] if path not in paths_uni]
renamed = found.pop()
# update the path in the database
cur.execute(
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
(new_mtime, normalize_path(new_path), ts(), normalize_path(renamed)),
(new_mtime, new_path, ts(), renamed),
)
return renamed
@@ -391,7 +401,7 @@ class Bitrot(object):
except (KeyError,IndexError):
cur.execute(
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
(normalize_path(new_path), new_mtime, new_sha1, ts()),
(new_path, new_mtime, new_sha1, ts()),
)
return new_path