From a8e52626efde820d39f6f8a0e2ed39f55284704f Mon Sep 17 00:00:00 2001 From: liloman Date: Fri, 3 Mar 2017 19:16:46 +0100 Subject: [PATCH] Swap sqlite cursor with dictionary and set data structures (#24) 1. Use 2 new data structures: -paths (set) contains all the files in the actual filesystem -hashes (dictionary) substitute the sqlite query with dict[hash] = set(db paths) 2. Minimal unitary tests created with bats (bash script) See https://github.com/ambv/bitrot/issues/23 for details. --- src/bitrot.py | 53 +++++----- tests/test-bitrot.bats | 220 +++++++++++++++++++++++++++++++++++++++++ tests/test_helper.bash | 61 ++++++++++++ 3 files changed, 310 insertions(+), 24 deletions(-) mode change 100644 => 100755 src/bitrot.py create mode 100755 tests/test-bitrot.bats create mode 100644 tests/test_helper.bash diff --git a/src/bitrot.py b/src/bitrot.py old mode 100644 new mode 100755 index 48337fb..470c4f0 --- a/src/bitrot.py +++ b/src/bitrot.py @@ -104,7 +104,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False): `follow_links` is False (the default). All entries present in `expected` must be files (can't be directories or symlinks). """ - paths = [] + paths = set() total_size = 0 for path, _, files in os.walk(directory): for f in files: @@ -129,9 +129,8 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False): else: if not stat.S_ISREG(st.st_mode) or p in ignored: continue - paths.append(p) + paths.add(p) total_size += st.st_size - paths.sort() return paths, total_size @@ -180,12 +179,13 @@ class Bitrot(object): errors = [] current_size = 0 missing_paths = self.select_all_paths(cur) + hashes = self.select_all_hashes(cur) paths, total_size = list_existing_paths( b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512}, follow_links=self.follow_links, ) - for p in paths: + for p in sorted(paths): p_uni = p.decode(FSENCODING) try: st = os.stat(p) @@ -227,7 +227,7 @@ class Bitrot(object): row = cur.fetchone() if not row: stored_path = self.handle_unknown_path( - cur, p_uni, new_mtime, new_sha1, + cur, p_uni, new_mtime, new_sha1, paths, hashes ) self.maybe_commit(conn) @@ -291,6 +291,16 @@ class Bitrot(object): row = cur.fetchone() return result + def select_all_hashes(self, cur): + result = {} + cur.execute('SELECT hash, path FROM bitrot') + row = cur.fetchone() + while row: + rhash, rpath = row + result.setdefault(rhash, set()).add(rpath) + row = cur.fetchone() + return result + def report_progress(self, current_size, total_size): size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1)) if size_fmt == self._last_reported_size: @@ -345,37 +355,32 @@ class Bitrot(object): if self.test and self.verbosity: print('warning: database file not updated on disk (test mode).') - def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1): + def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes): """Either add a new entry to the database or update the existing entry on rename. Returns `new_path` if the entry was indeed new or the `stored_path` (e.g. outdated path) if there was a rename. """ - cur.execute('SELECT mtime, path, timestamp FROM bitrot WHERE hash=?', - (new_sha1,)) - rows = cur.fetchall() - for row in rows: - stored_mtime, stored_path, stored_ts = row - if os.path.exists(stored_path): - # file still exists, move on - continue + try: # if the path isn't in the database + found = [path for path in hashes[new_sha1] if path not in paths] + renamed = found.pop() # update the path in the database cur.execute( 'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?', - (new_mtime, new_path, ts(), stored_path), + (new_mtime, new_path, ts(), renamed), ) - return stored_path - - # no rename, just a new file with the same hash - cur.execute( - 'INSERT INTO bitrot VALUES (?, ?, ?, ?)', - (new_path, new_mtime, new_sha1, ts()), - ) - return new_path - + return renamed + + # From hashes[new_sha1] or found.pop() + except (KeyError,IndexError): + cur.execute( + 'INSERT INTO bitrot VALUES (?, ?, ?, ?)', + (new_path, new_mtime, new_sha1, ts()), + ) + return new_path def get_path(directory=b'.', ext=b'db'): """Compose the path to the selected bitrot file.""" diff --git a/tests/test-bitrot.bats b/tests/test-bitrot.bats new file mode 100755 index 0000000..840c2be --- /dev/null +++ b/tests/test-bitrot.bats @@ -0,0 +1,220 @@ +#!/usr/bin/env bats + +load test_helper + + +#change it to your testing bitrot +cmd=~/Clones/bitrot/src/bitrot.py + +# cmd=bitrot + +test_dir=/tmp/bitrot_dir-$USER +mkdir -p $test_dir +cd $test_dir || exit + +########### +# BASIC # +########### + +@test "bitrot detects new files in a tree dir" { +mkdir -p notemptydirs/dir2/ +touch notemptydirs/dir2/new-file-{a,b}.txt +echo $RANDOM >> notemptydirs/dir2/new-file-b.txt +run $cmd -v +# check_fail "${lines[@]}" + +(( $status == 0 )) +# [[ ${lines[0]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[1]} = "2 entries in the database. 2 entries new:" ]] +[[ ${lines[2]} = " ./notemptydirs/dir2/new-file-a.txt" ]] +[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-b.txt" ]] +[[ ${lines[4]} = "Updating bitrot.sha512... done." ]] + +} + + +@test "bitrot detects modified files in a tree dir" { +sleep 1 +echo $RANDOM >> notemptydirs/dir2/new-file-a.txt +run $cmd -v +# check_fail "${lines[@]}" + +(( $status == 0 )) +[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]] +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "2 entries in the database. 1 entries updated:" ]] +[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-a.txt" ]] +[[ ${lines[4]} = "Updating bitrot.sha512... done." ]] + +} + +@test "bitrot detects renamed files in a tree dir" { +sleep 1 +mv notemptydirs/dir2/new-file-a.txt notemptydirs/dir2/new-file-a.txt2 +run $cmd -v +# check_fail "${lines[@]}" + +(( $status == 0 )) +[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]] +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "2 entries in the database. 1 entries renamed:" ]] +[[ ${lines[3]} = " from ./notemptydirs/dir2/new-file-a.txt to ./notemptydirs/dir2/new-file-a.txt2" ]] +[[ ${lines[4]} = "Updating bitrot.sha512... done." ]] + +} + +@test "bitrot detects delete files in a tree dir" { +sleep 1 +rm notemptydirs/dir2/new-file-a.txt2 +run $cmd -v +# check_fail "${lines[@]}" + +(( $status == 0 )) +[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]] +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "1 entries in the database. 1 entries missing:" ]] +[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-a.txt2" ]] +[[ ${lines[4]} = "Updating bitrot.sha512... done." ]] + +} + + +@test "bitrot detects new files and modified in a tree dir " { +sleep 1 +touch more-files-{a,b,c,d,e,f,g}.txt +echo $RANDOM >> notemptydirs/dir2/new-file-b.txt +run $cmd -v +#check_fail "${lines[@]}" + +(( $status == 0 )) + +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "8 entries in the database. 7 entries new:" ]] +[[ ${lines[3]} = " ./more-files-a.txt" ]] +[[ ${lines[4]} = " ./more-files-b.txt" ]] +[[ ${lines[5]} = " ./more-files-c.txt" ]] +[[ ${lines[6]} = " ./more-files-d.txt" ]] +[[ ${lines[7]} = " ./more-files-e.txt" ]] +[[ ${lines[8]} = " ./more-files-f.txt" ]] +[[ ${lines[9]} = " ./more-files-g.txt" ]] +[[ ${lines[10]} = "1 entries updated:" ]] +[[ ${lines[11]} = " ./notemptydirs/dir2/new-file-b.txt" ]] +[[ ${lines[12]} = "Updating bitrot.sha512... done." ]] +} + +@test "bitrot detects new files, modified, deleted and moved in a tree dir " { +sleep 1 +for fil in {a,b,c,d,e,f,g}; do +echo $RANDOM >> notemptydirs/pl-more-files-$fil.txt +done +echo $RANDOM >> notemptydirs/dir2/new-file-b.txt +mv more-files-a.txt more-files-a.txt2 +rm more-files-g.txt +run $cmd -v + +(( $status == 0 )) + +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "14 entries in the database. 7 entries new:" ]] +[[ ${lines[3]} = " ./notemptydirs/pl-more-files-a.txt" ]] +[[ ${lines[4]} = " ./notemptydirs/pl-more-files-b.txt" ]] +[[ ${lines[5]} = " ./notemptydirs/pl-more-files-c.txt" ]] +[[ ${lines[6]} = " ./notemptydirs/pl-more-files-d.txt" ]] +[[ ${lines[7]} = " ./notemptydirs/pl-more-files-e.txt" ]] +[[ ${lines[8]} = " ./notemptydirs/pl-more-files-f.txt" ]] +[[ ${lines[9]} = " ./notemptydirs/pl-more-files-g.txt" ]] +[[ ${lines[10]} = "1 entries updated:" ]] +[[ ${lines[11]} = " ./notemptydirs/dir2/new-file-b.txt" ]] +[[ ${lines[12]} = "1 entries renamed:" ]] +[[ ${lines[13]} = " from ./more-files-a.txt to ./more-files-a.txt2" ]] +[[ ${lines[14]} = "1 entries missing:" ]] +[[ ${lines[15]} = " ./more-files-g.txt" ]] +[[ ${lines[16]} = "Updating bitrot.sha512... done." ]] +} + + +@test "bitrot detects new files, modified, deleted and moved in a tree dir 2" { +sleep 1 +for fil in {a,b,c,d,e,f,g}; do +echo $RANDOM >> notemptydirs/pl2-more-files-$fil.txt +done +echo $RANDOM >> notemptydirs/pl-more-files-a.txt + +mv notemptydirs/pl-more-files-b.txt notemptydirs/pl-more-files-b.txt2 +cp notemptydirs/pl-more-files-g.txt notemptydirs/pl2-more-files-g.txt2 +cp notemptydirs/pl-more-files-d.txt notemptydirs/pl2-more-files-d.txt2 + +rm more-files-f.txt notemptydirs/pl-more-files-c.txt + +run $cmd -v + +# check_fail "${lines[@]}" + +(( $status == 0 )) + +# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]] +[[ ${lines[2]} = "21 entries in the database. 9 entries new:" ]] +[[ ${lines[3]} = " ./notemptydirs/pl2-more-files-a.txt" ]] +[[ ${lines[4]} = " ./notemptydirs/pl2-more-files-b.txt" ]] +[[ ${lines[5]} = " ./notemptydirs/pl2-more-files-c.txt" ]] +[[ ${lines[6]} = " ./notemptydirs/pl2-more-files-d.txt" ]] +[[ ${lines[7]} = " ./notemptydirs/pl2-more-files-d.txt2" ]] +[[ ${lines[8]} = " ./notemptydirs/pl2-more-files-e.txt" ]] +[[ ${lines[9]} = " ./notemptydirs/pl2-more-files-f.txt" ]] +[[ ${lines[10]} = " ./notemptydirs/pl2-more-files-g.txt" ]] +[[ ${lines[11]} = " ./notemptydirs/pl2-more-files-g.txt2" ]] +[[ ${lines[12]} = "1 entries updated:" ]] +[[ ${lines[13]} = " ./notemptydirs/pl-more-files-a.txt" ]] +[[ ${lines[14]} = "1 entries renamed:" ]] +[[ ${lines[15]} = " from ./notemptydirs/pl-more-files-b.txt to ./notemptydirs/pl-more-files-b.txt2" ]] +[[ ${lines[16]} = "2 entries missing:" ]] +[[ ${lines[17]} = " ./more-files-f.txt" ]] +[[ ${lines[18]} = " ./notemptydirs/pl-more-files-c.txt" ]] +[[ ${lines[19]} = "Updating bitrot.sha512... done." ]] +} + + +@test "bitrot can operate with 3278 files easily in a dir" { +sleep 1 +mkdir -p alotfiles/here; cd alotfiles/here +#create a 320KB file +dd if=/dev/urandom of=masterfile bs=1 count=327680 +#split it in 3277 files (instantly) + masterfile = 3278 +split -b 100 -a 10 masterfile +cd $test_dir +run $cmd + +(( $status == 0 )) +[[ ${lines[2]} = "3299 entries in the database, 3278 new, 0 updated, 0 renamed, 0 missing." ]] + +} + +@test "bitrot can operate with 3278 files easily in a dir 2 " { +sleep 1 +mv alotfiles/here alotfiles/here-moved +run $cmd +#check_fail "${lines[@]}" + +(( $status == 0 )) +[[ ${lines[2]} = "3299 entries in the database, 0 new, 0 updated, 3278 renamed, 0 missing." ]] + +} + +@test "bitrot can detetect a bitrot in a dir ! " { +sleep 1 +generate_bitrot ./bitrot-file 10 2 $cmd +run $cmd -q + +#check_fail "${lines[@]}" + +(( $status == 1 )) +[[ ${lines[0]} = *"error: SHA1 mismatch for ./bitrot-file: expected"* ]] +[[ ${lines[1]} = "error: There were 1 errors found." ]] +} + + +@test "Clean everything" { +run chmod -f a+w * +\rm -rf * $test_dir $BITROT_BACKUPS +} + diff --git a/tests/test_helper.bash b/tests/test_helper.bash new file mode 100644 index 0000000..c8008b3 --- /dev/null +++ b/tests/test_helper.bash @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# LC_ALL=en_US.UTF-8 +# LANGUAGE=en_US.UTF-8 +LANG=C + +check_fail() { + local temp=/tmp/bats.log + > $temp + for line; do + echo "$line" >> $temp + done + # cat /tmp/.bitrot.log >> $temp +} + + +generate_bitrot() { + local dest=$1 temp=/tmp/temp-base + local -i count=$(($2*100)) percent=${3:-5} + local cmd=$4 + mkdir -p "${dest%/*}" + local dir_base=${dest%%/*} + touch "$dest" $temp + #let's make sure they shared the same timestamp + touch "$dest" -r $temp + + dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null + run $cmd + #modify it and change modify date to base-file, simulate real bitrot so + dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null + touch "$dest" -r $temp + \rm -f $tmp + run $cmd +} + +generate_bitrots() { + local dest=$1 dest2=$2 temp=/tmp/temp-base + local -i count=$(($3*100)) percent=${4:-5} + mkdir -p "${dest%/*}" + mkdir -p "${dest2%/*}" + local dir_base=${dest%/*} + local dir_base2=${dest2%/*} + touch "$dest2" "$dest" $temp + #let's make sure they shared the same timestamp + touch "$dest" -r $temp + touch "$dest2" -r $temp + + dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null + dd if=/dev/zero of="$dest2" bs=1k count=$count &>/dev/null + run $r "$dir_base" "$dir_base2" + #modify it and change modify date to base-file, simulate bitrot so + dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null + dd seek=1k if=/dev/urandom of="$dest2" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null + touch "$dest" -r $temp + touch "$dest2" -r $temp + \rm -f $tmp + echo $status > /tmp/status + run $r "$dir_base" "$dir_base2" + echo $status >> /tmp/status +} +