Swap sqlite cursor with dictionary and set data structures (#24)

1. Use 2 new data structures:
- paths (set): contains all the files present in the actual filesystem
- hashes (dictionary): replaces the per-file sqlite query with dict[hash] = set(db paths)

2. Minimal unit tests created with bats (a Bash test framework)

See https://github.com/ambv/bitrot/issues/23 for details.
This commit is contained in:
liloman 2017-03-03 19:16:46 +01:00 committed by Łukasz Langa
parent 6b4a1fd46a
commit a8e52626ef
3 changed files with 310 additions and 24 deletions

41
src/bitrot.py Normal file → Executable file
View File

@ -104,7 +104,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
`follow_links` is False (the default). All entries present in `expected`
must be files (can't be directories or symlinks).
"""
paths = []
paths = set()
total_size = 0
for path, _, files in os.walk(directory):
for f in files:
@ -129,9 +129,8 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
else:
if not stat.S_ISREG(st.st_mode) or p in ignored:
continue
paths.append(p)
paths.add(p)
total_size += st.st_size
paths.sort()
return paths, total_size
@ -180,12 +179,13 @@ class Bitrot(object):
errors = []
current_size = 0
missing_paths = self.select_all_paths(cur)
hashes = self.select_all_hashes(cur)
paths, total_size = list_existing_paths(
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)
for p in paths:
for p in sorted(paths):
p_uni = p.decode(FSENCODING)
try:
st = os.stat(p)
@ -227,7 +227,7 @@ class Bitrot(object):
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1,
cur, p_uni, new_mtime, new_sha1, paths, hashes
)
self.maybe_commit(conn)
@ -291,6 +291,16 @@ class Bitrot(object):
row = cur.fetchone()
return result
def select_all_hashes(self, cur):
    """Return a mapping of every stored hash to the set of paths using it.

    `cur` is a database cursor on the `bitrot` table. Several paths may
    share one hash (identical file contents), hence the set values.
    """
    by_hash = {}
    cur.execute('SELECT hash, path FROM bitrot')
    # A cursor yields its pending rows when iterated.
    for digest, stored_path in cur:
        by_hash.setdefault(digest, set()).add(stored_path)
    return by_hash
def report_progress(self, current_size, total_size):
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
if size_fmt == self._last_reported_size:
@ -345,38 +355,33 @@ class Bitrot(object):
if self.test and self.verbosity:
print('warning: database file not updated on disk (test mode).')
def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes):
    """Either add a new entry to the database or update the existing entry
    on rename.

    `cur` is the database cursor. `new_path` is the text path of a file
    that has no row of its own in the database. `paths` is the set of
    paths found on disk during this run; the caller builds it from
    `list_existing_paths(b'.')`, so its members are bytes. `hashes` maps
    each stored hash to the set of text paths stored under it (see
    `select_all_hashes`). `hashes` is updated in place so that later calls
    in the same run see the change made here.

    Returns `new_path` if the entry was indeed new or the `stored_path` (e.g.
    outdated path) if there was a rename.
    """
    known_paths = hashes.setdefault(new_sha1, set())
    # Sorted so the choice among several stale candidates is deterministic.
    for stored_path in sorted(known_paths):
        # Database paths are text while `paths` holds bytes: encode before
        # the membership test, otherwise the lookup can never succeed on
        # Python 3 and every duplicate would be mistaken for a rename.
        if stored_path.encode(FSENCODING) in paths:
            # file still exists, move on
            continue
        # update the path in the database
        cur.execute(
            'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
            (new_mtime, new_path, ts(), stored_path),
        )
        # The stale path is consumed; without this a second file with the
        # same hash would "rename" it again, update no row and never be
        # stored.
        known_paths.discard(stored_path)
        known_paths.add(new_path)
        return stored_path
    # no rename, just a new file (possibly sharing its hash with files that
    # are still present)
    cur.execute(
        'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
        (new_path, new_mtime, new_sha1, ts()),
    )
    known_paths.add(new_path)
    return new_path
def get_path(directory=b'.', ext=b'db'):
    """Return the bytes path of the `.bitrot.<ext>` file inside `directory`."""
    file_name = b'.bitrot.' + ext
    return os.path.join(directory, file_name)

220
tests/test-bitrot.bats Executable file
View File

@ -0,0 +1,220 @@
#!/usr/bin/env bats
load test_helper
# Executable under test. Export BITROT_CMD to point at your own copy; the
# default is a local development checkout.
cmd=${BITROT_CMD:-~/Clones/bitrot/src/bitrot.py}
# cmd=bitrot
# Per-user scratch directory; every test below runs with it as the working
# directory and builds on the state left by the previous test.
test_dir=/tmp/bitrot_dir-$USER
mkdir -p "$test_dir"
# Abort if the scratch directory cannot be entered, so that no test ever
# creates or deletes files somewhere else.
cd "$test_dir" || exit
###########
# BASIC #
###########
# Creates two files in a nested directory (one left empty, one given some
# content) and expects the verbose run to list both as new entries.
# NOTE(review): the summary is expected at index 1 here but at index 2 in the
# later tests, presumably because the "Checking bitrot.db integrity" line is
# not printed when no database exists yet - confirm.
@test "bitrot detects new files in a tree dir" {
mkdir -p notemptydirs/dir2/
touch notemptydirs/dir2/new-file-{a,b}.txt
echo $RANDOM >> notemptydirs/dir2/new-file-b.txt
run $cmd -v
# check_fail "${lines[@]}"
(( $status == 0 ))
# [[ ${lines[0]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[1]} = "2 entries in the database. 2 entries new:" ]]
[[ ${lines[2]} = " ./notemptydirs/dir2/new-file-a.txt" ]]
[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-b.txt" ]]
[[ ${lines[4]} = "Updating bitrot.sha512... done." ]]
}
# Appends data to an already tracked file and expects it to be reported as
# updated. The one-second sleep is presumably there so the modification time
# differs from the one recorded by the previous run - confirm.
@test "bitrot detects modified files in a tree dir" {
sleep 1
echo $RANDOM >> notemptydirs/dir2/new-file-a.txt
run $cmd -v
# check_fail "${lines[@]}"
(( $status == 0 ))
[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]]
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "2 entries in the database. 1 entries updated:" ]]
[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-a.txt" ]]
[[ ${lines[4]} = "Updating bitrot.sha512... done." ]]
}
# Renames a tracked file without touching its content and expects a single
# "renamed" entry that names both the old and the new path.
@test "bitrot detects renamed files in a tree dir" {
sleep 1
mv notemptydirs/dir2/new-file-a.txt notemptydirs/dir2/new-file-a.txt2
run $cmd -v
# check_fail "${lines[@]}"
(( $status == 0 ))
[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]]
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "2 entries in the database. 1 entries renamed:" ]]
[[ ${lines[3]} = " from ./notemptydirs/dir2/new-file-a.txt to ./notemptydirs/dir2/new-file-a.txt2" ]]
[[ ${lines[4]} = "Updating bitrot.sha512... done." ]]
}
# Deletes the file renamed by the previous test and expects it to be reported
# as missing, leaving one entry in the database.
@test "bitrot detects delete files in a tree dir" {
sleep 1
rm notemptydirs/dir2/new-file-a.txt2
run $cmd -v
# check_fail "${lines[@]}"
(( $status == 0 ))
[[ ${lines[0]} = "Checking bitrot.db integrity... ok." ]]
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "1 entries in the database. 1 entries missing:" ]]
[[ ${lines[3]} = " ./notemptydirs/dir2/new-file-a.txt2" ]]
[[ ${lines[4]} = "Updating bitrot.sha512... done." ]]
}
# Mixes two kinds of change in one run: seven new empty files at the top
# level plus one modification of a tracked file. The expected output lists
# the new entries first, in sorted order, followed by the updated entry.
@test "bitrot detects new files and modified in a tree dir " {
sleep 1
touch more-files-{a,b,c,d,e,f,g}.txt
echo $RANDOM >> notemptydirs/dir2/new-file-b.txt
run $cmd -v
#check_fail "${lines[@]}"
(( $status == 0 ))
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "8 entries in the database. 7 entries new:" ]]
[[ ${lines[3]} = " ./more-files-a.txt" ]]
[[ ${lines[4]} = " ./more-files-b.txt" ]]
[[ ${lines[5]} = " ./more-files-c.txt" ]]
[[ ${lines[6]} = " ./more-files-d.txt" ]]
[[ ${lines[7]} = " ./more-files-e.txt" ]]
[[ ${lines[8]} = " ./more-files-f.txt" ]]
[[ ${lines[9]} = " ./more-files-g.txt" ]]
[[ ${lines[10]} = "1 entries updated:" ]]
[[ ${lines[11]} = " ./notemptydirs/dir2/new-file-b.txt" ]]
[[ ${lines[12]} = "Updating bitrot.sha512... done." ]]
}
# Exercises all four kinds of change in a single run: seven new files with
# content, one modified file, one renamed file and one deleted file. The
# report is expected in the order new, updated, renamed, missing.
@test "bitrot detects new files, modified, deleted and moved in a tree dir " {
sleep 1
for fil in {a,b,c,d,e,f,g}; do
echo $RANDOM >> notemptydirs/pl-more-files-$fil.txt
done
echo $RANDOM >> notemptydirs/dir2/new-file-b.txt
mv more-files-a.txt more-files-a.txt2
rm more-files-g.txt
run $cmd -v
(( $status == 0 ))
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "14 entries in the database. 7 entries new:" ]]
[[ ${lines[3]} = " ./notemptydirs/pl-more-files-a.txt" ]]
[[ ${lines[4]} = " ./notemptydirs/pl-more-files-b.txt" ]]
[[ ${lines[5]} = " ./notemptydirs/pl-more-files-c.txt" ]]
[[ ${lines[6]} = " ./notemptydirs/pl-more-files-d.txt" ]]
[[ ${lines[7]} = " ./notemptydirs/pl-more-files-e.txt" ]]
[[ ${lines[8]} = " ./notemptydirs/pl-more-files-f.txt" ]]
[[ ${lines[9]} = " ./notemptydirs/pl-more-files-g.txt" ]]
[[ ${lines[10]} = "1 entries updated:" ]]
[[ ${lines[11]} = " ./notemptydirs/dir2/new-file-b.txt" ]]
[[ ${lines[12]} = "1 entries renamed:" ]]
[[ ${lines[13]} = " from ./more-files-a.txt to ./more-files-a.txt2" ]]
[[ ${lines[14]} = "1 entries missing:" ]]
[[ ${lines[15]} = " ./more-files-g.txt" ]]
[[ ${lines[16]} = "Updating bitrot.sha512... done." ]]
}
# Same mix as the previous test, plus two copies of files that still exist.
# The copies share a hash with their originals, yet the expected output lists
# them (the two *.txt2 entries) as new, not as renames: only a file whose old
# path has disappeared counts as renamed.
@test "bitrot detects new files, modified, deleted and moved in a tree dir 2" {
sleep 1
for fil in {a,b,c,d,e,f,g}; do
echo $RANDOM >> notemptydirs/pl2-more-files-$fil.txt
done
echo $RANDOM >> notemptydirs/pl-more-files-a.txt
mv notemptydirs/pl-more-files-b.txt notemptydirs/pl-more-files-b.txt2
cp notemptydirs/pl-more-files-g.txt notemptydirs/pl2-more-files-g.txt2
cp notemptydirs/pl-more-files-d.txt notemptydirs/pl2-more-files-d.txt2
rm more-files-f.txt notemptydirs/pl-more-files-c.txt
run $cmd -v
# check_fail "${lines[@]}"
(( $status == 0 ))
# [[ ${lines[1]} = "Finished. 0.00 MiB of data read. 0 errors found." ]]
[[ ${lines[2]} = "21 entries in the database. 9 entries new:" ]]
[[ ${lines[3]} = " ./notemptydirs/pl2-more-files-a.txt" ]]
[[ ${lines[4]} = " ./notemptydirs/pl2-more-files-b.txt" ]]
[[ ${lines[5]} = " ./notemptydirs/pl2-more-files-c.txt" ]]
[[ ${lines[6]} = " ./notemptydirs/pl2-more-files-d.txt" ]]
[[ ${lines[7]} = " ./notemptydirs/pl2-more-files-d.txt2" ]]
[[ ${lines[8]} = " ./notemptydirs/pl2-more-files-e.txt" ]]
[[ ${lines[9]} = " ./notemptydirs/pl2-more-files-f.txt" ]]
[[ ${lines[10]} = " ./notemptydirs/pl2-more-files-g.txt" ]]
[[ ${lines[11]} = " ./notemptydirs/pl2-more-files-g.txt2" ]]
[[ ${lines[12]} = "1 entries updated:" ]]
[[ ${lines[13]} = " ./notemptydirs/pl-more-files-a.txt" ]]
[[ ${lines[14]} = "1 entries renamed:" ]]
[[ ${lines[15]} = " from ./notemptydirs/pl-more-files-b.txt to ./notemptydirs/pl-more-files-b.txt2" ]]
[[ ${lines[16]} = "2 entries missing:" ]]
[[ ${lines[17]} = " ./more-files-f.txt" ]]
[[ ${lines[18]} = " ./notemptydirs/pl-more-files-c.txt" ]]
[[ ${lines[19]} = "Updating bitrot.sha512... done." ]]
}
# Bulk test: fills a directory with thousands of small files and expects a
# non-verbose run to report every one of them as new in its one-line summary.
@test "bitrot can operate with 3278 files easily in a dir" {
sleep 1
mkdir -p alotfiles/here; cd alotfiles/here
#create a 320KB file (327680 bytes of random data, written one byte at a time)
dd if=/dev/urandom of=masterfile bs=1 count=327680
#split it into 100-byte pieces: 3277 files (instantly) + masterfile = 3278
split -b 100 -a 10 masterfile
# back to the scratch root so bitrot scans the whole tree
cd $test_dir
run $cmd
(( $status == 0 ))
[[ ${lines[2]} = "3299 entries in the database, 3278 new, 0 updated, 0 renamed, 0 missing." ]]
}
# Moves the whole bulk directory created by the previous test and expects
# every file in it to be reported as renamed, with none new or missing.
@test "bitrot can operate with 3278 files easily in a dir 2 " {
sleep 1
mv alotfiles/here alotfiles/here-moved
run $cmd
#check_fail "${lines[@]}"
(( $status == 0 ))
[[ ${lines[2]} = "3299 entries in the database, 0 new, 0 updated, 3278 renamed, 0 missing." ]]
}
# Simulates silent corruption with the generate_bitrot helper (see
# test_helper): the helper changes a tracked file's content and then resets
# its timestamp. The quiet run that follows must exit with status 1 and
# report a SHA1 mismatch for that file.
@test "bitrot can detetect a bitrot in a dir ! " {
sleep 1
generate_bitrot ./bitrot-file 10 2 $cmd
run $cmd -q
#check_fail "${lines[@]}"
(( $status == 1 ))
[[ ${lines[0]} = *"error: SHA1 mismatch for ./bitrot-file: expected"* ]]
[[ ${lines[1]} = "error: There were 1 errors found." ]]
}
# Tear-down disguised as a test: makes everything in the current directory
# writable again and removes the scratch tree.
# NOTE(review): $BITROT_BACKUPS is not set anywhere in this file; if it is
# unset the unquoted expansion simply vanishes - confirm where it is meant to
# come from.
@test "Clean everything" {
run chmod -f a+w *
# the backslash bypasses any alias for rm (such as an interactive "rm -i")
\rm -rf * $test_dir $BITROT_BACKUPS
}

61
tests/test_helper.bash Normal file
View File

@ -0,0 +1,61 @@
#!/usr/bin/env bash
# LC_ALL=en_US.UTF-8
# LANGUAGE=en_US.UTF-8
LANG=C
# Debug aid: write every argument, one per line, to /tmp/bats.log, replacing
# whatever the file held before. Typically called with "${lines[@]}".
check_fail() {
    local temp=/tmp/bats.log
    # One redirection for the whole loop; the file is still truncated when
    # no arguments are given.
    for line in "$@"; do
        echo "$line"
    done > $temp
    # cat /tmp/.bitrot.log >> $temp
}
# generate_bitrot DEST SIZE [PERCENT] CMD
#
# Simulate silent corruption of DEST:
#   1. create DEST as SIZE*100 KiB of zeros and run CMD so it gets recorded;
#   2. write random data (PERCENT of the size, default 5) into DEST without
#      truncating it, then set its timestamp back to that of a reference file
#      created at the start, so the change cannot be told from the mtime;
#   3. run CMD again so the caller can inspect the resulting database state.
# Uses bats' `run`, so $status and $lines reflect the last CMD invocation.
generate_bitrot() {
    local dest=$1 temp=/tmp/temp-base
    local -i count=$(($2*100)) percent=${3:-5}
    local cmd=$4
    mkdir -p "${dest%/*}"
    touch "$dest" "$temp"
    # copy the reference file's timestamp onto DEST (option first, so this
    # also works with touch implementations that stop parsing at operands)
    touch -r "$temp" "$dest"
    dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null
    # $cmd is left unquoted on purpose so it may carry arguments
    run $cmd
    # modify the content, then restore the reference timestamp to simulate
    # real bitrot.
    # NOTE(review): seek=1k skips 1024 output blocks of 1 KiB, i.e. the write
    # starts 1 MiB in; for files smaller than that the data lands past the
    # old end and the file grows - confirm this is intended.
    dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    touch -r "$temp" "$dest"
    # remove the reference file (the original referenced an undefined $tmp
    # here, so nothing was ever deleted)
    \rm -f "$temp"
    run $cmd
}
# generate_bitrots DEST DEST2 SIZE [PERCENT]
#
# Two-file variant of generate_bitrot: creates DEST and DEST2 as SIZE*100 KiB
# of zeros, runs the command in $r on their parent directories, writes random
# data (PERCENT of the size, default 5) into both files while restoring their
# timestamps from a reference file, and runs the command again. The exit
# status of each run is recorded in /tmp/status.
# NOTE(review): $r is not defined in this file; it is presumably set by the
# test suite that calls this helper - confirm. With $r empty, `run` would try
# to execute the first directory as a command.
generate_bitrots() {
    local dest=$1 dest2=$2 temp=/tmp/temp-base
    local -i count=$(($3*100)) percent=${4:-5}
    mkdir -p "${dest%/*}"
    mkdir -p "${dest2%/*}"
    local dir_base=${dest%/*}
    local dir_base2=${dest2%/*}
    touch "$dest2" "$dest" "$temp"
    # copy the reference file's timestamp onto both files
    touch -r "$temp" "$dest"
    touch -r "$temp" "$dest2"
    dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null
    dd if=/dev/zero of="$dest2" bs=1k count=$count &>/dev/null
    run $r "$dir_base" "$dir_base2"
    # modify the content, then restore the reference timestamp to simulate
    # bitrot
    dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    dd seek=1k if=/dev/urandom of="$dest2" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    touch -r "$temp" "$dest"
    touch -r "$temp" "$dest2"
    # remove the reference file (the original referenced an undefined $tmp
    # here, so nothing was ever deleted)
    \rm -f "$temp"
    # status of the first run, taken before the second run overwrites it
    echo $status > /tmp/status
    run $r "$dir_base" "$dir_base2"
    echo $status >> /tmp/status
}