add sha512, see #3

Christian Zangl 2022-02-20 19:11:29 +01:00
parent 457d38b19b
commit d0c78ec3ee
No known key found for this signature in database
GPG Key ID: 6D468AC36E2A4B3D
6 changed files with 92 additions and 26 deletions

View File

@@ -39,22 +39,23 @@ Run `chkbit -u PATH` to create/update the chkbit index.
 chkbit will
 - create a `.chkbit` index in every subdirectory of the path it was given.
-- update the index with md5 hashes for every file.
+- update the index with md5/sha512 hashes for every file.
 - report damage for files that failed the integrity check since the last run (check the exit status).
 Run `chkbit PATH` to verify only.
 ```
-usage: chkbit.py [-h] [-u] [-f] [-i] [-w N] [-q] [-v] [PATH [PATH ...]]
+usage: chkbit.py [-h] [-u] [--algo ALGO] [-f] [-i] [-w N] [-q] [-v] [PATH ...]
 Checks the data integrity of your files. See https://github.com/laktak/chkbit-py
 positional arguments:
   PATH                  directories to check
-optional arguments:
+options:
   -h, --help            show this help message and exit
   -u, --update          update indices (without this chkbit will only verify files)
+  --algo ALGO           hash algorithm: md5, sha512
   -f, --force           force update of damaged items
   -i, --verify-index    verify files in the index only (will not report new files)
   -w N, --workers N     number of workers to use, default=5
@@ -112,12 +113,28 @@ The disadvantage is obviously that you get hidden `.chkbit` files in your content
 chkbit operates on files.
-When run for the first time it records a md5 hash of the file contents as well as the file modification time.
+When run for the first time it records a hash of the file contents as well as the file modification time.
 When you run it again it first checks the modification time,
-- if the time changed (because you made an edit) it records a new md5 hash.
-- otherwise it will compare the current md5 to the recorded value and report an error if they do not match.
+- if the time changed (because you made an edit) it records a new hash.
+- otherwise it will compare the current hash to the recorded value and report an error if they do not match.
+### I wish to use a stronger hash algorithm
+chkbit now supports sha512. You can specify it with `--algo sha512`.
+Note that existing index files will use the hash that they were created with. If you wish to update all hashes you need to delete your existing indexes first.
+### How can I delete the index files?
+List them with
+```
+find . -name .chkbit
+```
+and add `-delete` to delete.
 ### Can I test if chkbit is working correctly?
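The FAQ entry above describes the per-file check: record the modification time plus a content hash, then on later runs compare the mtime first and the hash second. A minimal sketch of that logic using the new `{"mod", "a", "h"}` entry shape from this commit; `file_entry` and `check` are hypothetical names, not chkbit's implementation.

```python
# Sketch only: store mtime + hash on first run, compare on later runs.
import hashlib
import os

def file_entry(path, algo="sha512"):
    h = hashlib.new(algo)  # "md5" or "sha512"
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(128 * 1024), b""):
            h.update(buf)
    return {"mod": int(os.stat(path).st_mtime * 1000), "a": algo, "h": h.hexdigest()}

def check(path, recorded):
    current = file_entry(path, recorded["a"])
    if current["mod"] != recorded["mod"]:
        return "update", current   # file was edited: record the new hash
    if current["h"] != recorded["h"]:
        return "damage", recorded  # same mtime but different content
    return "ok", recorded
```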

chkbit/context.py (new file, 13 additions)
View File

@@ -0,0 +1,13 @@
+import hashlib
+
+
+class Context:
+    def __init__(self, verify_index, update, force, hash_algo):
+        self.verify_index = verify_index
+        self.update = update
+        self.force = force
+        self.hash_algo = hash_algo
+
+        if hash_algo not in ["md5", "sha512"]:
+            raise Exception(f"{hash_algo} is unknown.")

View File

@@ -1,18 +1,25 @@
 import hashlib
-BLOCKSIZE = 2 ** 10 * 128  # kb
+BLOCKSIZE = 2**10 * 128  # kb
-def hashfile(path):
-    md5 = hashlib.md5()
+def hashfile(path, hash_algo=None):
+    if not hash_algo or hash_algo == "md5":
+        h = hashlib.md5()
+    elif hash_algo == "sha512":
+        h = hashlib.sha512()
+    else:
+        raise Exception(f"{hash_algo} is unknown.")
     with open(path, "rb") as f:
         while True:
             buf = f.read(BLOCKSIZE)
             if len(buf) <= 0:
                 break
-            md5.update(buf)
-    return md5.hexdigest()
+            h.update(buf)
+    return h.hexdigest()
 def hashtext(text):
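For reference, a standalone approximation of what `hashfile(path, "sha512")` computes after this change: a chunked sha512 hex digest over the file. This is a sketch, not the module itself.

```python
import hashlib

def sha512_hexdigest(path, blocksize=2**10 * 128):
    """Chunked sha512 over a file, mirroring hashfile(path, "sha512") above."""
    h = hashlib.sha512()
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(blocksize), b""):
            h.update(buf)
    return h.hexdigest()
```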

View File

@@ -58,12 +58,22 @@ class Index:
             self.log(stat, os.path.join(self.path, name))
 
     # calc new hashes for this index
-    def update(self):
+    def update(self, context):
         for name in self.files:
             if self.should_ignore(name):
                 self._log(Stat.SKIP, name)
                 continue
-            self.new[name] = self._calc_file(name)
+            a = context.hash_algo
+            # check previously used hash
+            if name in self.old:
+                old = self.old[name]
+                if "md5" in old:
+                    a = "md5"  # legacy structure
+                    self.old[name] = {"mod": old["mod"], "a": a, "h": old["md5"]}
+                elif "a" in old:
+                    a = old["a"]
+            self.new[name] = self._calc_file(name, a)
 
     # check/update the index (old vs new)
     def check_fix(self, force):
@@ -77,7 +87,7 @@ class Index:
             b = self.new[name]
             amod = a["mod"]
             bmod = b["mod"]
-            if a["md5"] == b["md5"]:
+            if a["h"] == b["h"]:
                 # ok, if the content stays the same the mod time does not matter
                 self._log(Stat.OK, name)
                 if amod != bmod:
@@ -101,11 +111,11 @@ class Index:
                 self._log(Stat.WARN_OLD, name)
                 self._setmod()
 
-    def _calc_file(self, name):
+    def _calc_file(self, name, a):
         path = os.path.join(self.path, name)
         info = os.stat(path)
         mtime = int(info.st_mtime * 1000)
-        return {"mod": mtime, "md5": hashfile(path)}
+        return {"mod": mtime, "a": a, "h": hashfile(path, a)}
 
     def save(self):
         if self.modified:
@@ -114,7 +124,7 @@ class Index:
             data["idx_hash"] = hashtext(text)
             with open(self.idx_file, "w", encoding="utf-8") as f:
-                json.dump(data, f)
+                json.dump(data, f, separators=(",", ":"))
             self.modified = False
             return True
         else:
@@ -129,7 +139,11 @@ class Index:
             if "data" in data:
                 # extract old format from js version
                 for item in json.loads(data["data"]):
-                    self.old[item["name"]] = {"mod": item["mod"], "md5": item["md5"]}
+                    self.old[item["name"]] = {
+                        "mod": item["mod"],
+                        "a": "md5",
+                        "h": item["md5"],
+                    }
             elif "idx" in data:
                 self.old = data["idx"]
         text = json.dumps(self.old, separators=(",", ":"))
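The `update()` change above also migrates legacy index entries on the fly: an entry that still carries an `md5` field is rewritten into the new `{"mod", "a", "h"}` shape so the algorithm travels with the digest. A sketch of that mapping (`upgrade_entry` is a hypothetical helper name):

```python
def upgrade_entry(entry):
    """Rewrite a legacy {"mod", "md5"} entry into the new {"mod", "a", "h"} shape."""
    if "md5" in entry:  # legacy structure
        return {"mod": entry["mod"], "a": "md5", "h": entry["md5"]}
    return entry

print(upgrade_entry({"mod": 1645380000000, "md5": "d41d8cd98f00b204e9800998ecf8427e"}))
# {'mod': 1645380000000, 'a': 'md5', 'h': 'd41d8cd98f00b204e9800998ecf8427e'}
```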

View File

@@ -6,11 +6,11 @@ from chkbit import Index, Stat
 
 class IndexThread:
-    def __init__(self, idx, args, res_queue, todo_queue):
+    def __init__(self, idx, context, res_queue, todo_queue):
         self.idx = idx
-        self.verify_index_only = args.verify_index
-        self.update = args.update and not self.verify_index_only
-        self.force = args.force
+        self.verify_index_only = context.verify_index
+        self.update = context.update and not self.verify_index_only
+        self.context = context
         self.todo_queue = todo_queue
         self.res_queue = res_queue
         self.t = threading.Thread(target=self.run)
@@ -40,10 +40,10 @@ class IndexThread:
             if e.load() or not self.verify_index_only:
                 # calc the new hashes
-                e.update()
+                e.update(self.context)
                 # compare
-                e.check_fix(self.force)
+                e.check_fix(self.context.force)
                 # save if update is set
                 if self.update:

View File

@@ -4,7 +4,7 @@ import time
 import argparse
 import queue
 import threading
-from chkbit import IndexThread, Stat
+from chkbit import Context, IndexThread, Stat
 
 STATUS_CODES = """
 Status codes:
@@ -63,6 +63,13 @@ class Main:
             help="update indices (without this chkbit will only verify files)",
         )
+        parser.add_argument(
+            "--algo",
+            type=str,
+            default="md5",
+            help="hash algorithm: md5, sha512",
+        )
         parser.add_argument(
             "-f", "--force", action="store_true", help="force update of damaged items"
         )
@@ -90,6 +97,7 @@ class Main:
             action="store_true",
             help="quiet, don't show progress/information",
         )
+
        parser.add_argument(
             "-v", "--verbose", action="store_true", help="verbose output"
         )
@@ -120,9 +128,16 @@ class Main:
         for path in self.args.paths:
             todo_queue.put(path)
 
+        context = Context(
+            self.args.verify_index,
+            self.args.update,
+            self.args.force,
+            self.args.algo,
+        )
+
         # start indexing
         workers = [
-            IndexThread(idx, self.args, self.res_queue, todo_queue)
+            IndexThread(idx, context, self.res_queue, todo_queue)
             for idx in range(self.args.workers)
         ]
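End to end, the new flag flows from argparse into `Context`, which each `IndexThread` then reads instead of the raw args object. A trimmed, standalone sketch of that wiring (option set reduced to the relevant flags; assumes the chkbit package is importable):

```python
import argparse
from chkbit import Context

parser = argparse.ArgumentParser()
parser.add_argument("-u", "--update", action="store_true")
parser.add_argument("-f", "--force", action="store_true")
parser.add_argument("-i", "--verify-index", action="store_true")
parser.add_argument("--algo", type=str, default="md5", help="hash algorithm: md5, sha512")
args = parser.parse_args(["-u", "--algo", "sha512"])

# args.algo is validated inside Context and later picked up by every worker thread.
context = Context(args.verify_index, args.update, args.force, args.algo)
```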