.chkbitignore support for subdirectories #8

This commit is contained in:
Christian Zangl 2024-01-03 18:33:36 +01:00
parent 0f55a94658
commit df44bc7bf1
No known key found for this signature in database
GPG Key ID: 6D468AC36E2A4B3D
11 changed files with 183 additions and 81 deletions

View File

@ -74,7 +74,7 @@ chkbit will
Run `chkbit PATH` to verify only. Run `chkbit PATH` to verify only.
``` ```
usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...] usage: chkbit [-h] [-u] [--show-ignored-only] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...]
Checks the data integrity of your files. See https://github.com/laktak/chkbit-py Checks the data integrity of your files. See https://github.com/laktak/chkbit-py
@ -84,6 +84,7 @@ positional arguments:
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
-u, --update update indices (without this chkbit will verify files in readonly mode) -u, --update update indices (without this chkbit will verify files in readonly mode)
--show-ignored-only only show ignored files
--algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3) --algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3)
-f, --force force update of damaged items -f, --force force update of damaged items
-s, --skip-symlinks do not follow symlinks -s, --skip-symlinks do not follow symlinks
@ -94,6 +95,12 @@ options:
-q, --quiet quiet, don't show progress/information -q, --quiet quiet, don't show progress/information
-v, --verbose verbose output -v, --verbose verbose output
.chkbitignore rules:
each line should contain exactly one name
you may use Unix shell-style wildcards (see README)
lines starting with `#` are skipped
lines starting with `/` are only applied to the current directory
Status codes: Status codes:
DMG: error, data damage detected DMG: error, data damage detected
EIX: error, index damaged EIX: error, index damaged
@ -101,7 +108,7 @@ Status codes:
new: new file new: new file
upd: file updated upd: file updated
ok : check ok ok : check ok
skp: skipped (see .chkbitignore) ign: ignored (see .chkbitignore)
EXC: internal exception EXC: internal exception
``` ```
@ -123,9 +130,13 @@ You should
Add a `.chkbitignore` file containing the names of the files/directories you wish to ignore Add a `.chkbitignore` file containing the names of the files/directories you wish to ignore
- each line should contain exactly one name - each line should contain exactly one name
- you may use [Unix shell-style wildcards](https://docs.python.org/3/library/fnmatch.html)
- `*` matches everything
- `?` matches any single character
- `[seq]` matches any character in seq
- `[!seq]` matches any character not in seq
- lines starting with `#` are skipped - lines starting with `#` are skipped
- you may use [Unix shell-style wildcards](https://docs.python.org/3.8/library/fnmatch.html) - lines starting with `/` are only applied to the current directory
- at the moment does not allow to match files in subdirectories (PR welcome)
## FAQ ## FAQ

View File

@ -1,4 +1,6 @@
from chkbit.status import Status from chkbit.status import Status
from chkbit.ignore import Ignore
from chkbit.input_item import InputItem
from chkbit.context import Context from chkbit.context import Context
from chkbit.hashfile import hashfile, hashtext from chkbit.hashfile import hashfile, hashtext
from chkbit.index import Index from chkbit.index import Index

View File

@ -1,5 +1,8 @@
from __future__ import annotations
import queue import queue
from chkbit import Status import chkbit
from typing import Optional
from chkbit import InputItem
class Context: class Context:
@ -9,6 +12,7 @@ class Context:
num_workers=5, num_workers=5,
force=False, force=False,
update=False, update=False,
show_ignored_only=False,
hash_algo="blake3", hash_algo="blake3",
skip_symlinks=False, skip_symlinks=False,
index_filename=".chkbit", index_filename=".chkbit",
@ -17,19 +21,30 @@ class Context:
self.num_workers = num_workers self.num_workers = num_workers
self.force = force self.force = force
self.update = update self.update = update
self.show_ignored_only = show_ignored_only
self.hash_algo = hash_algo self.hash_algo = hash_algo
self.skip_symlinks = skip_symlinks self.skip_symlinks = skip_symlinks
self.index_filename = index_filename self.index_filename = index_filename
self.ignore_filename = ignore_filename self.ignore_filename = ignore_filename
# the input queue is used to distribute the work
# to the index threads
self.input_queue = queue.Queue()
self.result_queue = queue.Queue() self.result_queue = queue.Queue()
self.hit_queue = queue.Queue() self.hit_queue = queue.Queue()
if hash_algo not in ["md5", "sha512", "blake3"]: if hash_algo not in ["md5", "sha512", "blake3"]:
raise Exception(f"{hash_algo} is unknown.") raise Exception(f"{hash_algo} is unknown.")
def log(self, stat: Status, path: str): def log(self, stat: chkbit.Status, path: str):
self.result_queue.put((0, stat, path)) self.result_queue.put((0, stat, path))
def hit(self, *, cfiles: int = 0, cbytes: int = 0): def hit(self, *, cfiles: int = 0, cbytes: int = 0):
self.result_queue.put((1, cfiles, cbytes)) self.result_queue.put((1, cfiles, cbytes))
def add_input(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
    """Queue `path` on the input queue.

    The path is wrapped in an InputItem together with `ignore`, the
    (optional) Ignore instance that should accompany it.
    """
    item = InputItem(path, ignore=ignore)
    self.input_queue.put(item)
def end_input(self):
    """Put a single `None` end marker on the input queue."""
    end_marker = None
    self.input_queue.put(end_marker)

View File

@ -29,7 +29,7 @@ def hashfile(path: str, hash_algo: str, *, hit: Callable[[str], None]):
return h.hexdigest() return h.hexdigest()
def hashtext(text): def hashtext(text: str):
md5 = hashlib.md5() md5 = hashlib.md5()
md5.update(text.encode("utf-8")) md5.update(text.encode("utf-8"))
return md5.hexdigest() return md5.hexdigest()

56
chkbit/ignore.py Normal file
View File

@ -0,0 +1,56 @@
from __future__ import annotations
import fnmatch
import os
import sys
import chkbit
from enum import Enum
from typing import Optional
class Ignore:
    """Ignore rules of one directory, chained to the rules of its parents.

    The rules are read from the ignore file (`context.ignore_filename`)
    located in `path`. Every rule is an `fnmatch` pattern:

    - a rule starting with `/` is only applied to entries located directly
      in this directory (the leading `/` is removed before matching)
    - every other rule is applied to entries of this directory and of all
      directories that use this instance as (indirect) parent
    """

    def __init__(
        self,
        context: chkbit.Context,
        path: str,
        *,
        parent_ignore: Optional[chkbit.Ignore],
    ):
        """Load the rules for the directory `path`.

        `parent_ignore` holds the rules of the parent directory, or None
        if there is no parent to consult.
        """
        self.parent_ignore = parent_ignore
        self.context = context
        self.path = path
        # directory name with a trailing slash, used as the prefix when the
        # relative path of an entry is built for the parent's rules
        self.name = os.path.basename(path) + "/"
        self.ignore = []
        self.load_ignore()

    @property
    def ignore_filepath(self):
        """Path of the ignore file inside this directory."""
        return os.path.join(self.path, self.context.ignore_filename)

    def load_ignore(self):
        """Read the rules from the ignore file.

        A missing file leaves the rule list untouched. Blank lines and
        lines starting with `#` are dropped; all other lines are kept
        verbatim.
        """
        if not os.path.exists(self.ignore_filepath):
            return
        with open(self.ignore_filepath, "r", encoding="utf-8") as f:
            text = f.read()
        self.ignore = [
            line
            for line in text.splitlines()
            if line.strip() and not line.startswith("#")
        ]

    def should_ignore(self, name: str, *, fullname: Optional[str] = None):
        """Return True if the entry matches a rule of this directory or a parent.

        `name` is the bare name of the entry. `fullname` is the path of the
        entry relative to this directory; it is only set when the call is
        delegated from a subdirectory and must be left unset for entries
        located directly in this directory.
        """
        for rule in self.ignore:
            if rule.startswith("/"):
                if fullname:
                    # anchored rules never apply to entries of subdirectories
                    continue
                rule = rule[1:]
            if fnmatch.fnmatch(name, rule):
                return True
            if fullname and fnmatch.fnmatch(fullname, rule):
                return True
        if self.parent_ignore:
            # Always hand the bare name to the parent. Passing the relative
            # path instead would hide the name from every rule more than one
            # level up, so plain rules would stop matching at depth two.
            return self.parent_ignore.should_ignore(
                name, fullname=self.name + (fullname or name)
            )
        return False

View File

@ -1,40 +1,38 @@
from __future__ import annotations
import fnmatch import fnmatch
import os import os
import subprocess import subprocess
import sys import sys
import json import json
import chkbit
from chkbit import hashfile, hashtext, Status from chkbit import hashfile, hashtext, Status
from typing import Optional
VERSION = 2 # index version VERSION = 2 # index version
class Index: class Index:
def __init__(self, context, path, files, *, readonly=False): def __init__(
self,
context: chkbit.Context,
path: str,
files: list[str],
*,
readonly: bool = False,
):
self.context = context self.context = context
self.path = path self.path = path
self.files = files self.files = files
self.old = {} self.old = {}
self.new = {} self.new = {}
self.ignore = []
self.load_ignore()
self.updates = [] self.updates = []
self.modified = None self.modified = None
self.readonly = readonly self.readonly = readonly
@property
def ignore_filepath(self):
return os.path.join(self.path, self.context.ignore_filename)
@property @property
def index_filepath(self): def index_filepath(self):
return os.path.join(self.path, self.context.index_filename) return os.path.join(self.path, self.context.index_filename)
def should_ignore(self, name):
for ignore in self.ignore:
if fnmatch.fnmatch(name, ignore):
return True
return False
def _setmod(self, value=True): def _setmod(self, value=True):
self.modified = value self.modified = value
@ -42,10 +40,10 @@ class Index:
self.context.log(stat, os.path.join(self.path, name)) self.context.log(stat, os.path.join(self.path, name))
# calc new hashes for this index # calc new hashes for this index
def update(self): def calc_hashes(self, *, ignore: Optional[chkbit.Ignore] = None):
for name in self.files: for name in self.files:
if self.should_ignore(name): if ignore and ignore.should_ignore(name):
self._log(Status.SKIP, name) self._log(Status.IGNORE, name)
continue continue
a = self.context.hash_algo a = self.context.hash_algo
@ -65,8 +63,13 @@ class Index:
else: else:
self.new[name] = self._calc_file(name, a) self.new[name] = self._calc_file(name, a)
def show_ignored_only(self, ignore: chkbit.Ignore):
    """Log each name in `self.files` that `ignore` matches with Status.IGNORE."""
    for ignored_name in filter(ignore.should_ignore, self.files):
        self._log(Status.IGNORE, ignored_name)
# check/update the index (old vs new) # check/update the index (old vs new)
def check_fix(self, force): def check_fix(self, force: bool):
for name in self.new.keys(): for name in self.new.keys():
if not name in self.old: if not name in self.old:
self._log(Status.NEW, name) self._log(Status.NEW, name)
@ -101,7 +104,7 @@ class Index:
self._log(Status.WARN_OLD, name) self._log(Status.WARN_OLD, name)
self._setmod() self._setmod()
def _list_file(self, name, a): def _list_file(self, name: str, a: str):
# produce a dummy entry for new files when the index is not updated # produce a dummy entry for new files when the index is not updated
return { return {
"mod": None, "mod": None,
@ -109,7 +112,7 @@ class Index:
"h": None, "h": None,
} }
def _calc_file(self, name, a): def _calc_file(self, name: str, a: str):
path = os.path.join(self.path, name) path = os.path.join(self.path, name)
info = os.stat(path) info = os.stat(path)
mtime = int(info.st_mtime * 1000) mtime = int(info.st_mtime * 1000)
@ -158,15 +161,3 @@ class Index:
self._setmod() self._setmod()
self._log(Status.ERR_IDX, self.index_filepath) self._log(Status.ERR_IDX, self.index_filepath)
return True return True
def load_ignore(self):
if not os.path.exists(self.ignore_filepath):
return
with open(self.ignore_filepath, "r", encoding="utf-8") as f:
text = f.read()
self.ignore = list(
filter(
lambda x: x and x[0] != "#" and len(x.strip()) > 0, text.splitlines()
)
)

View File

@ -1,27 +1,29 @@
from __future__ import annotations
import os import os
import sys import sys
import time import time
import threading import threading
from chkbit import Index, Status import chkbit
from chkbit import Index, Status, Ignore
class IndexThread: class IndexThread:
def __init__(self, thread_no, context, input_queue): def __init__(self, thread_no: int, context: chkbit.Context):
self.thread_no = thread_no self.thread_no = thread_no
self.update = context.update self.update = context.update
self.context = context self.context = context
self.input_queue = input_queue self.input_queue = context.input_queue
self.t = threading.Thread(target=self._run) self.t = threading.Thread(target=self._run)
self.t.daemon = True self.t.daemon = True
self.t.start() self.t.start()
def _process_root(self, parent): def _process_root(self, iitem: chkbit.InputItem):
files = [] files = []
dirs = [] dirs = []
# load files and subdirs # load files and subdirs
for name in os.listdir(path=parent): for name in os.listdir(path=iitem.path):
path = os.path.join(parent, name) path = os.path.join(iitem.path, name)
if name[0] == ".": if name[0] == ".":
continue continue
if os.path.isdir(path): if os.path.isdir(path):
@ -33,36 +35,42 @@ class IndexThread:
files.append(name) files.append(name)
# load index # load index
index = Index(self.context, parent, files, readonly=not self.update) index = Index(self.context, iitem.path, files, readonly=not self.update)
index.load() index.load()
# calc the new hashes # load ignore
index.update() ignore = Ignore(self.context, iitem.path, parent_ignore=iitem.ignore)
# compare if self.context.show_ignored_only:
index.check_fix(self.context.force) index.show_ignored_only(ignore)
else:
# calc the new hashes
index.calc_hashes(ignore=ignore)
# save if update is set # compare
if self.update: index.check_fix(self.context.force)
if index.save():
self.context.log(Status.UPDATE_INDEX, "") # save if update is set
if self.update:
if index.save():
self.context.log(Status.UPDATE_INDEX, "")
# process subdirs # process subdirs
for name in dirs: for name in dirs:
if not index.should_ignore(name): if not ignore.should_ignore(name):
self.input_queue.put(os.path.join(parent, name)) self.context.add_input(os.path.join(iitem.path, name), ignore=ignore)
else: else:
self.context.log(Status.SKIP, name + "/") self.context.log(Status.IGNORE, name + "/")
def _run(self): def _run(self):
while True: while True:
parent = self.input_queue.get() iitem = self.input_queue.get()
if parent is None: if iitem is None:
break break
try: try:
self._process_root(parent) self._process_root(iitem)
except Exception as e: except Exception as e:
self.context.log(Status.INTERNALEXCEPTION, f"{parent}: {e}") self.context.log(Status.INTERNALEXCEPTION, f"{iitem.path}: {e}")
self.input_queue.task_done() self.input_queue.task_done()
def join(self): def join(self):

9
chkbit/input_item.py Normal file
View File

@ -0,0 +1,9 @@
from __future__ import annotations
from typing import Optional
import chkbit
class InputItem:
    """An entry of the input queue: a path plus an optional Ignore instance."""

    def __init__(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
        # ignore rules that accompany the path (None if there are none)
        self.ignore = ignore
        # the path to process
        self.path = path

View File

@ -8,6 +8,6 @@ class Status(Enum):
NEW = "new" NEW = "new"
UPDATE = "upd" UPDATE = "upd"
OK = "ok " OK = "ok "
SKIP = "skp" IGNORE = "ign"
INTERNALEXCEPTION = "EXC" INTERNALEXCEPTION = "EXC"
UPDATE_INDEX = "iup" UPDATE_INDEX = "iup"

View File

@ -10,7 +10,13 @@ from chkbit import Context, Status, IndexThread
from chkbit_cli import CLI, Progress, RateCalc, sparkify from chkbit_cli import CLI, Progress, RateCalc, sparkify
STATUS_CODES = """ EPILOG = """
.chkbitignore rules:
each line should contain exactly one name
you may use Unix shell-style wildcards (see README)
lines starting with `#` are skipped
lines starting with `/` are only applied to the current directory
Status codes: Status codes:
DMG: error, data damage detected DMG: error, data damage detected
EIX: error, index damaged EIX: error, index damaged
@ -18,7 +24,7 @@ Status codes:
new: new file new: new file
upd: file updated upd: file updated
ok : check ok ok : check ok
skp: skipped (see .chkbitignore) ign: ignored (see .chkbitignore)
EXC: internal exception EXC: internal exception
""" """
@ -67,7 +73,7 @@ class Main:
elif stat == Status.NEW: elif stat == Status.NEW:
self.num_new += 1 self.num_new += 1
if self.verbose or not stat in [Status.OK, Status.SKIP]: if self.verbose or not stat in [Status.OK, Status.IGNORE]:
CLI.printline(stat.value, " ", path) CLI.printline(stat.value, " ", path)
def _res_worker(self, context: Context): def _res_worker(self, context: Context):
@ -113,18 +119,15 @@ class Main:
print(self.total, end="\r") print(self.total, end="\r")
def process(self, args): def process(self, args):
# the input queue is used to distribute the work if args.update and args.show_ignored_only:
# to the index threads print("Error: use either --update or --show-ignored-only!", file=sys.stderr)
input_queue = queue.Queue() return None
# put the initial paths into the queue
for path in args.paths:
input_queue.put(path)
context = Context( context = Context(
num_workers=args.workers, num_workers=args.workers,
force=args.force, force=args.force,
update=args.update, update=args.update,
show_ignored_only=args.show_ignored_only,
hash_algo=args.algo, hash_algo=args.algo,
skip_symlinks=args.skip_symlinks, skip_symlinks=args.skip_symlinks,
index_filename=args.index_name, index_filename=args.index_name,
@ -132,10 +135,12 @@ class Main:
) )
self.result_queue = context.result_queue self.result_queue = context.result_queue
# put the initial paths into the queue
for path in args.paths:
context.add_input(path)
# start indexing # start indexing
workers = [ workers = [IndexThread(i, context) for i in range(context.num_workers)]
IndexThread(i, context, input_queue) for i in range(context.num_workers)
]
# log the results from the workers # log the results from the workers
res_worker = threading.Thread(target=self._res_worker, args=(context,)) res_worker = threading.Thread(target=self._res_worker, args=(context,))
@ -143,11 +148,11 @@ class Main:
res_worker.start() res_worker.start()
# wait for work to finish # wait for work to finish
input_queue.join() context.input_queue.join()
# signal workers to exit # signal workers to exit
for worker in workers: for worker in workers:
input_queue.put(None) context.end_input()
# signal res_worker to exit # signal res_worker to exit
self.result_queue.put(None) self.result_queue.put(None)
@ -231,7 +236,7 @@ class Main:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="chkbit", prog="chkbit",
description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py", description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py",
epilog=STATUS_CODES, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
@ -246,6 +251,10 @@ class Main:
help="update indices (without this chkbit will verify files in readonly mode)", help="update indices (without this chkbit will verify files in readonly mode)",
) )
parser.add_argument(
"--show-ignored-only", action="store_true", help="only show ignored files"
)
parser.add_argument( parser.add_argument(
"--algo", "--algo",
type=str, type=str,
@ -305,7 +314,7 @@ class Main:
args = parser.parse_args() args = parser.parse_args()
self.verbose = args.verbose self.verbose = args.verbose or args.show_ignored_only
if args.quiet: if args.quiet:
self.progress = Progress.Quiet self.progress = Progress.Quiet
elif not sys.stdout.isatty(): elif not sys.stdout.isatty():
@ -315,7 +324,8 @@ class Main:
if args.paths: if args.paths:
context = self.process(args) context = self.process(args)
self.print_result(context) if context and not context.show_ignored_only:
self.print_result(context)
else: else:
parser.print_help() parser.print_help()

View File

@ -1,6 +1,6 @@
[project] [project]
name = "chkbit" name = "chkbit"
version = "3.0.2" version = "4.0.0"
description = "chkbit checks the data integrity of your files" description = "chkbit checks the data integrity of your files"
authors = [ authors = [
{name = "Christian Zangl", email = "laktak@cdak.net"}, {name = "Christian Zangl", email = "laktak@cdak.net"},
@ -8,7 +8,7 @@ authors = [
dependencies = [ dependencies = [
"blake3>=0.3.4", "blake3>=0.3.4",
] ]
requires-python = ">=3.6.0" requires-python = ">=3.7.0"
readme = "README.md" readme = "README.md"
license = {file = "LICENSE"} license = {file = "LICENSE"}