Normalize unicode paths in the database (#37)

* Normalize unicode paths in the database to enable use of the same database across different platforms

* Decide whether Unicode normalization should be applied using a plain string comparison instead of a regexp
This commit is contained in:
Stan Senotrusov 2020-05-18 00:27:05 +08:00 committed by GitHub
parent 4ea0a57e0a
commit 74f043b3ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -38,6 +38,7 @@ import stat
import sys
import tempfile
import time
import unicodedata
DEFAULT_CHUNK_SIZE = 16384 # block size in HFS+; 4X the block size in ext4
@ -52,6 +53,13 @@ if sys.version[0] == '2':
# use `bytes` for bytestrings
def normalize_path(path):
    """Return `path` in the Unicode form used for database lookups.

    When the filesystem encoding is UTF-8, `path` is NFKC-normalized so
    that equivalent composed and decomposed spellings of a file name map
    to the same string.  For any other filesystem encoding the path is
    returned unchanged.

    The encoding name is compared case-insensitively and the hyphen-less
    alias ``utf8`` is accepted as well.  A missing encoding (``None`` or
    an empty string) is treated as non-UTF-8.
    """
    # `FSENCODING or ''` keeps a None encoding from raising on .lower().
    encoding = (FSENCODING or '').lower()
    if encoding in ('utf-8', 'utf8'):
        return unicodedata.normalize('NFKC', path)
    return path
def sha1(path, chunk_size):
digest = hashlib.sha1()
with open(path, 'rb') as f:
@ -105,6 +113,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
must be files (can't be directories or symlinks).
"""
paths = set()
paths_decoded_and_normalized = set()
total_size = 0
for path, _, files in os.walk(directory):
for f in files:
@ -119,7 +128,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
continue
try:
if follow_links or p_uni in expected:
if follow_links or normalize_path(p_uni) in expected:
st = os.stat(p)
else:
st = os.lstat(p)
@ -130,8 +139,9 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
if not stat.S_ISREG(st.st_mode) or p in ignored:
continue
paths.add(p)
paths_decoded_and_normalized.add(normalize_path(p.decode(FSENCODING)))
total_size += st.st_size
return paths, total_size
return paths, total_size, paths_decoded_and_normalized
class BitrotException(Exception):
@ -180,7 +190,7 @@ class Bitrot(object):
current_size = 0
missing_paths = self.select_all_paths(cur)
hashes = self.select_all_hashes(cur)
paths, total_size = list_existing_paths(
paths, total_size, paths_decoded_and_normalized = list_existing_paths(
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)
@ -210,7 +220,7 @@ class Bitrot(object):
if self.verbosity:
self.report_progress(current_size, total_size)
missing_paths.discard(p_uni)
missing_paths.discard(normalize_path(p_uni))
try:
new_sha1 = sha1(p, self.chunk_size)
except (IOError, OSError) as e:
@ -223,11 +233,11 @@ class Bitrot(object):
continue
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
'path=?', (p_uni,))
'path=?', (normalize_path(p_uni),))
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1, paths, hashes
cur, p_uni, new_mtime, new_sha1, paths_decoded_and_normalized, hashes
)
self.maybe_commit(conn)
@ -235,7 +245,7 @@ class Bitrot(object):
new_paths.append(p) # FIXME: shouldn't that be p_uni?
else:
renamed_paths.append((stored_path, p_uni))
missing_paths.discard(stored_path)
missing_paths.discard(normalize_path(stored_path))
continue
stored_mtime, stored_sha1, stored_ts = row
@ -243,7 +253,7 @@ class Bitrot(object):
updated_paths.append(p)
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
'WHERE path=?',
(new_mtime, new_sha1, ts(), p_uni))
(new_mtime, new_sha1, ts(), normalize_path(p_uni)))
self.maybe_commit(conn)
continue
@ -258,7 +268,7 @@ class Bitrot(object):
)
for path in missing_paths:
cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
cur.execute('DELETE FROM bitrot WHERE path=?', (normalize_path(path),)) # it is expected that content of missing_paths is already normalized, but just to be sure
conn.commit()
@ -358,7 +368,7 @@ class Bitrot(object):
if self.test and self.verbosity:
print('warning: database file not updated on disk (test mode).')
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes):
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_decoded_and_normalized, hashes):
"""Either add a new entry to the database or update the existing entry
on rename.
@ -367,12 +377,12 @@ class Bitrot(object):
"""
try: # if the path isn't in the database
found = [path for path in hashes[new_sha1] if path not in paths]
found = [path for path in hashes[new_sha1] if path not in paths_decoded_and_normalized]
renamed = found.pop()
# update the path in the database
cur.execute(
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
(new_mtime, new_path, ts(), renamed),
(new_mtime, normalize_path(new_path), ts(), normalize_path(renamed)),
)
return renamed
@ -381,7 +391,7 @@ class Bitrot(object):
except (KeyError,IndexError):
cur.execute(
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
(new_path, new_mtime, new_sha1, ts()),
(normalize_path(new_path), new_mtime, new_sha1, ts()),
)
return new_path