Normalize unicode paths in the database (#37)
* Normalize Unicode paths in the database, enabling use of the same database across different platforms. * Check whether Unicode normalization should be applied without using a regexp.
This commit is contained in:
parent
4ea0a57e0a
commit
74f043b3ca
@ -38,6 +38,7 @@ import stat
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
|
||||
DEFAULT_CHUNK_SIZE = 16384 # block size in HFS+; 4X the block size in ext4
|
||||
@ -52,6 +53,13 @@ if sys.version[0] == '2':
|
||||
# use `bytes` for bytestrings
|
||||
|
||||
|
||||
def normalize_path(path, encoding=None):
    """Return *path* in Unicode NFKC form when the filesystem encoding is UTF-8.

    Normalizing lets the same logical path compare equal across platforms
    (e.g. macOS stores decomposed NFD while Linux typically stores NFC), so
    one bitrot database can be shared between them.  For non-UTF-8
    filesystem encodings the path is returned unchanged, since Unicode
    normalization is not meaningful there.

    :param path: unicode path string to normalize.
    :param encoding: filesystem encoding name to check; when ``None``
        (the default, preserving the original call signature) the
        module-level ``FSENCODING`` is used.
    :return: the NFKC-normalized path, or ``path`` unchanged.
    """
    enc = FSENCODING if encoding is None else encoding
    # Case-insensitive comparison replaces the original pair of equality
    # tests ('utf-8' / 'UTF-8') and also catches mixed-case spellings
    # such as 'Utf-8' that the original missed.
    if enc.lower() == 'utf-8':
        return unicodedata.normalize('NFKC', path)
    return path
|
||||
|
||||
|
||||
def sha1(path, chunk_size):
|
||||
digest = hashlib.sha1()
|
||||
with open(path, 'rb') as f:
|
||||
@ -105,6 +113,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
|
||||
must be files (can't be directories or symlinks).
|
||||
"""
|
||||
paths = set()
|
||||
paths_decoded_and_normalized = set()
|
||||
total_size = 0
|
||||
for path, _, files in os.walk(directory):
|
||||
for f in files:
|
||||
@ -119,7 +128,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
|
||||
continue
|
||||
|
||||
try:
|
||||
if follow_links or p_uni in expected:
|
||||
if follow_links or normalize_path(p_uni) in expected:
|
||||
st = os.stat(p)
|
||||
else:
|
||||
st = os.lstat(p)
|
||||
@ -130,8 +139,9 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
|
||||
if not stat.S_ISREG(st.st_mode) or p in ignored:
|
||||
continue
|
||||
paths.add(p)
|
||||
paths_decoded_and_normalized.add(normalize_path(p.decode(FSENCODING)))
|
||||
total_size += st.st_size
|
||||
return paths, total_size
|
||||
return paths, total_size, paths_decoded_and_normalized
|
||||
|
||||
|
||||
class BitrotException(Exception):
|
||||
@ -180,7 +190,7 @@ class Bitrot(object):
|
||||
current_size = 0
|
||||
missing_paths = self.select_all_paths(cur)
|
||||
hashes = self.select_all_hashes(cur)
|
||||
paths, total_size = list_existing_paths(
|
||||
paths, total_size, paths_decoded_and_normalized = list_existing_paths(
|
||||
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
|
||||
follow_links=self.follow_links,
|
||||
)
|
||||
@ -210,7 +220,7 @@ class Bitrot(object):
|
||||
if self.verbosity:
|
||||
self.report_progress(current_size, total_size)
|
||||
|
||||
missing_paths.discard(p_uni)
|
||||
missing_paths.discard(normalize_path(p_uni))
|
||||
try:
|
||||
new_sha1 = sha1(p, self.chunk_size)
|
||||
except (IOError, OSError) as e:
|
||||
@ -223,11 +233,11 @@ class Bitrot(object):
|
||||
continue
|
||||
|
||||
cur.execute('SELECT mtime, hash, timestamp FROM bitrot WHERE '
|
||||
'path=?', (p_uni,))
|
||||
'path=?', (normalize_path(p_uni),))
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
stored_path = self.handle_unknown_path(
|
||||
cur, p_uni, new_mtime, new_sha1, paths, hashes
|
||||
cur, p_uni, new_mtime, new_sha1, paths_decoded_and_normalized, hashes
|
||||
)
|
||||
self.maybe_commit(conn)
|
||||
|
||||
@ -235,7 +245,7 @@ class Bitrot(object):
|
||||
new_paths.append(p) # FIXME: shouldn't that be p_uni?
|
||||
else:
|
||||
renamed_paths.append((stored_path, p_uni))
|
||||
missing_paths.discard(stored_path)
|
||||
missing_paths.discard(normalize_path(stored_path))
|
||||
continue
|
||||
|
||||
stored_mtime, stored_sha1, stored_ts = row
|
||||
@ -243,7 +253,7 @@ class Bitrot(object):
|
||||
updated_paths.append(p)
|
||||
cur.execute('UPDATE bitrot SET mtime=?, hash=?, timestamp=? '
|
||||
'WHERE path=?',
|
||||
(new_mtime, new_sha1, ts(), p_uni))
|
||||
(new_mtime, new_sha1, ts(), normalize_path(p_uni)))
|
||||
self.maybe_commit(conn)
|
||||
continue
|
||||
|
||||
@ -258,7 +268,7 @@ class Bitrot(object):
|
||||
)
|
||||
|
||||
for path in missing_paths:
|
||||
cur.execute('DELETE FROM bitrot WHERE path=?', (path,))
|
||||
cur.execute('DELETE FROM bitrot WHERE path=?', (normalize_path(path),)) # it is expected that content of missing_paths is already normalized, but just to be sure
|
||||
|
||||
conn.commit()
|
||||
|
||||
@ -358,7 +368,7 @@ class Bitrot(object):
|
||||
if self.test and self.verbosity:
|
||||
print('warning: database file not updated on disk (test mode).')
|
||||
|
||||
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes):
|
||||
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths_decoded_and_normalized, hashes):
|
||||
"""Either add a new entry to the database or update the existing entry
|
||||
on rename.
|
||||
|
||||
@ -367,12 +377,12 @@ class Bitrot(object):
|
||||
"""
|
||||
|
||||
try: # if the path isn't in the database
|
||||
found = [path for path in hashes[new_sha1] if path not in paths]
|
||||
found = [path for path in hashes[new_sha1] if path not in paths_decoded_and_normalized]
|
||||
renamed = found.pop()
|
||||
# update the path in the database
|
||||
cur.execute(
|
||||
'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
|
||||
(new_mtime, new_path, ts(), renamed),
|
||||
(new_mtime, normalize_path(new_path), ts(), normalize_path(renamed)),
|
||||
)
|
||||
|
||||
return renamed
|
||||
@ -381,7 +391,7 @@ class Bitrot(object):
|
||||
except (KeyError,IndexError):
|
||||
cur.execute(
|
||||
'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
|
||||
(new_path, new_mtime, new_sha1, ts()),
|
||||
(normalize_path(new_path), new_mtime, new_sha1, ts()),
|
||||
)
|
||||
return new_path
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user