From df44bc7bf11d21ec61a3aa0d494c65131f059801 Mon Sep 17 00:00:00 2001 From: Christian Zangl Date: Wed, 3 Jan 2024 18:33:36 +0100 Subject: [PATCH] .chkbitignore support for subdirectories #8 --- README.md | 19 +++++++++++--- chkbit/__init__.py | 2 ++ chkbit/context.py | 19 ++++++++++++-- chkbit/hashfile.py | 2 +- chkbit/ignore.py | 56 ++++++++++++++++++++++++++++++++++++++++++ chkbit/index.py | 53 +++++++++++++++++---------------------- chkbit/index_thread.py | 52 ++++++++++++++++++++++----------------- chkbit/input_item.py | 9 +++++++ chkbit/status.py | 2 +- chkbit_cli/main.py | 46 ++++++++++++++++++++-------------- pyproject.toml | 4 +-- 11 files changed, 183 insertions(+), 81 deletions(-) create mode 100644 chkbit/ignore.py create mode 100644 chkbit/input_item.py diff --git a/README.md b/README.md index cbf8733..9edc047 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ chkbit will Run `chkbit PATH` to verify only. ``` -usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...] +usage: chkbit [-h] [-u] [--show-ignored-only] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...] Checks the data integrity of your files. See https://github.com/laktak/chkbit-py @@ -84,6 +84,7 @@ positional arguments: options: -h, --help show this help message and exit -u, --update update indices (without this chkbit will verify files in readonly mode) + --show-ignored-only only show ignored files --algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3) -f, --force force update of damaged items -s, --skip-symlinks do not follow symlinks @@ -94,6 +95,12 @@ options: -q, --quiet quiet, don't show progress/information -v, --verbose verbose output +.chkbitignore rules: + each line should contain exactly one name + you may use Unix shell-style wildcards (see README) + lines starting with `#` are skipped + lines starting with `/` are only applied to the current directory + Status codes: DMG: error, data damage detected EIX: error, index damaged @@ -101,7 +108,7 @@ Status codes: new: new file upd: file updated ok : check ok - skp: skipped (see .chkbitignore) + ign: ignored (see .chkbitignore) EXC: internal exception ``` @@ -123,9 +130,13 @@ You should Add a `.chkbitignore` file containing the names of the files/directories you wish to ignore - each line should contain exactly one name +- you may use [Unix shell-style wildcards](https://docs.python.org/3/library/fnmatch.html) + - `*` matches everything + - `?` matches any single character + - `[seq]` matches any character in seq + - `[!seq]` matches any character not in seq - lines starting with `#` are skipped -- you may use [Unix shell-style wildcards](https://docs.python.org/3.8/library/fnmatch.html) -- at the moment does not allow to match files in subdirectories (PR welcome) +- lines starting with `/` are only applied to the current directory ## FAQ diff --git a/chkbit/__init__.py b/chkbit/__init__.py index 915004f..1c7ba82 100644 --- a/chkbit/__init__.py +++ b/chkbit/__init__.py @@ -1,4 +1,6 @@ from chkbit.status import Status +from chkbit.ignore import Ignore +from chkbit.input_item import InputItem from chkbit.context import Context from chkbit.hashfile import hashfile, hashtext from chkbit.index import Index diff --git a/chkbit/context.py b/chkbit/context.py index 7abbf34..71e8be8 100644 --- a/chkbit/context.py +++ b/chkbit/context.py @@ -1,5 +1,8 @@ +from __future__ import annotations import queue -from chkbit import Status +import chkbit +from typing import Optional +from chkbit import InputItem class Context: @@ -9,6 +12,7 @@ class Context: num_workers=5, force=False, update=False, + show_ignored_only=False, hash_algo="blake3", skip_symlinks=False, index_filename=".chkbit", @@ -17,19 +21,30 @@ class Context: self.num_workers = num_workers self.force = force self.update = update + self.show_ignored_only = show_ignored_only self.hash_algo = hash_algo self.skip_symlinks = skip_symlinks self.index_filename = index_filename self.ignore_filename = ignore_filename + # the input queue is used to distribute the work + # to the index threads + self.input_queue = queue.Queue() + self.result_queue = queue.Queue() self.hit_queue = queue.Queue() if hash_algo not in ["md5", "sha512", "blake3"]: raise Exception(f"{hash_algo} is unknown.") - def log(self, stat: Status, path: str): + def log(self, stat: chkbit.Status, path: str): self.result_queue.put((0, stat, path)) def hit(self, *, cfiles: int = 0, cbytes: int = 0): self.result_queue.put((1, cfiles, cbytes)) + + def add_input(self, path: str, *, ignore: Optional[chkbit.Ignore] = None): + self.input_queue.put(InputItem(path, ignore=ignore)) + + def end_input(self): + self.input_queue.put(None) diff --git a/chkbit/hashfile.py b/chkbit/hashfile.py index 0d66d17..a7d2a06 100644 --- a/chkbit/hashfile.py +++ b/chkbit/hashfile.py @@ -29,7 +29,7 @@ def hashfile(path: str, hash_algo: str, *, hit: Callable[[str], None]): return h.hexdigest() -def hashtext(text): +def hashtext(text: str): md5 = hashlib.md5() md5.update(text.encode("utf-8")) return md5.hexdigest() diff --git a/chkbit/ignore.py b/chkbit/ignore.py new file mode 100644 index 0000000..050e429 --- /dev/null +++ b/chkbit/ignore.py @@ -0,0 +1,56 @@ +from __future__ import annotations +import fnmatch +import os +import sys +import chkbit +from enum import Enum +from typing import Optional + + +class Ignore: + def __init__( + self, + context: chkbit.Context, + path: str, + *, + parent_ignore: Optional[chkbit.Ignore], + ): + self.parent_ignore = parent_ignore + self.context = context + self.path = path + self.name = os.path.basename(path) + "/" + self.ignore = [] + self.load_ignore() + + @property + def ignore_filepath(self): + return os.path.join(self.path, self.context.ignore_filename) + + def load_ignore(self): + if not os.path.exists(self.ignore_filepath): + return + with open(self.ignore_filepath, "r", encoding="utf-8") as f: + text = f.read() + + self.ignore = list( + filter( + lambda x: x and x[0] != "#" and len(x.strip()) > 0, text.splitlines() + ) + ) + + def should_ignore(self, name: str, *, fullname: str = None): + for ignore in self.ignore: + if ignore.startswith("/"): + if fullname: + continue + else: + ignore = ignore[1:] + if fnmatch.fnmatch(name, ignore): + return True + if fullname and fnmatch.fnmatch(fullname, ignore): + return True + if self.parent_ignore: + return self.parent_ignore.should_ignore( + fullname or name, fullname=self.name + (fullname or name) + ) + return False diff --git a/chkbit/index.py b/chkbit/index.py index 4f9bb7e..3fd8827 100644 --- a/chkbit/index.py +++ b/chkbit/index.py @@ -1,40 +1,38 @@ +from __future__ import annotations import fnmatch import os import subprocess import sys import json +import chkbit from chkbit import hashfile, hashtext, Status +from typing import Optional VERSION = 2 # index version class Index: - def __init__(self, context, path, files, *, readonly=False): + def __init__( + self, + context: chkbit.Context, + path: str, + files: list[str], + *, + readonly: bool = False, + ): self.context = context self.path = path self.files = files self.old = {} self.new = {} - self.ignore = [] - self.load_ignore() self.updates = [] self.modified = None self.readonly = readonly - @property - def ignore_filepath(self): - return os.path.join(self.path, self.context.ignore_filename) - @property def index_filepath(self): return os.path.join(self.path, self.context.index_filename) - def should_ignore(self, name): - for ignore in self.ignore: - if fnmatch.fnmatch(name, ignore): - return True - return False - def _setmod(self, value=True): self.modified = value @@ -42,10 +40,10 @@ class Index: self.context.log(stat, os.path.join(self.path, name)) # calc new hashes for this index - def update(self): + def calc_hashes(self, *, ignore: Optional[chkbit.Ignore] = None): for name in self.files: - if self.should_ignore(name): - self._log(Status.SKIP, name) + if ignore and ignore.should_ignore(name): + self._log(Status.IGNORE, name) continue a = self.context.hash_algo @@ -65,8 +63,13 @@ class Index: else: self.new[name] = self._calc_file(name, a) + def show_ignored_only(self, ignore: chkbit.Ignore): + for name in self.files: + if ignore.should_ignore(name): + self._log(Status.IGNORE, name) + # check/update the index (old vs new) - def check_fix(self, force): + def check_fix(self, force: bool): for name in self.new.keys(): if not name in self.old: self._log(Status.NEW, name) @@ -101,7 +104,7 @@ class Index: self._log(Status.WARN_OLD, name) self._setmod() - def _list_file(self, name, a): + def _list_file(self, name: str, a: str): # produce a dummy entry for new files when the index is not updated return { "mod": None, @@ -109,7 +112,7 @@ class Index: "h": None, } - def _calc_file(self, name, a): + def _calc_file(self, name: str, a: str): path = os.path.join(self.path, name) info = os.stat(path) mtime = int(info.st_mtime * 1000) @@ -158,15 +161,3 @@ class Index: self._setmod() self._log(Status.ERR_IDX, self.index_filepath) return True - - def load_ignore(self): - if not os.path.exists(self.ignore_filepath): - return - with open(self.ignore_filepath, "r", encoding="utf-8") as f: - text = f.read() - - self.ignore = list( - filter( - lambda x: x and x[0] != "#" and len(x.strip()) > 0, text.splitlines() - ) - ) diff --git a/chkbit/index_thread.py b/chkbit/index_thread.py index f22ce95..684af93 100644 --- a/chkbit/index_thread.py +++ b/chkbit/index_thread.py @@ -1,27 +1,29 @@ +from __future__ import annotations import os import sys import time import threading -from chkbit import Index, Status +import chkbit +from chkbit import Index, Status, Ignore class IndexThread: - def __init__(self, thread_no, context, input_queue): + def __init__(self, thread_no: int, context: chkbit.Context): self.thread_no = thread_no self.update = context.update self.context = context - self.input_queue = input_queue + self.input_queue = context.input_queue self.t = threading.Thread(target=self._run) self.t.daemon = True self.t.start() - def _process_root(self, parent): + def _process_root(self, iitem: chkbit.InputItem): files = [] dirs = [] # load files and subdirs - for name in os.listdir(path=parent): - path = os.path.join(parent, name) + for name in os.listdir(path=iitem.path): + path = os.path.join(iitem.path, name) if name[0] == ".": continue if os.path.isdir(path): @@ -33,36 +35,42 @@ class IndexThread: files.append(name) # load index - index = Index(self.context, parent, files, readonly=not self.update) + index = Index(self.context, iitem.path, files, readonly=not self.update) index.load() - # calc the new hashes - index.update() + # load ignore + ignore = Ignore(self.context, iitem.path, parent_ignore=iitem.ignore) - # compare - index.check_fix(self.context.force) + if self.context.show_ignored_only: + index.show_ignored_only(ignore) + else: + # calc the new hashes + index.calc_hashes(ignore=ignore) - # save if update is set - if self.update: - if index.save(): - self.context.log(Status.UPDATE_INDEX, "") + # compare + index.check_fix(self.context.force) + + # save if update is set + if self.update: + if index.save(): + self.context.log(Status.UPDATE_INDEX, "") # process subdirs for name in dirs: - if not index.should_ignore(name): - self.input_queue.put(os.path.join(parent, name)) + if not ignore.should_ignore(name): + self.context.add_input(os.path.join(iitem.path, name), ignore=ignore) else: - self.context.log(Status.SKIP, name + "/") + self.context.log(Status.IGNORE, name + "/") def _run(self): while True: - parent = self.input_queue.get() - if parent is None: + iitem = self.input_queue.get() + if iitem is None: break try: - self._process_root(parent) + self._process_root(iitem) except Exception as e: - self.context.log(Status.INTERNALEXCEPTION, f"{parent}: {e}") + self.context.log(Status.INTERNALEXCEPTION, f"{iitem.path}: {e}") self.input_queue.task_done() def join(self): diff --git a/chkbit/input_item.py b/chkbit/input_item.py new file mode 100644 index 0000000..b3c504f --- /dev/null +++ b/chkbit/input_item.py @@ -0,0 +1,9 @@ +from __future__ import annotations +from typing import Optional +import chkbit + + +class InputItem: + def __init__(self, path: str, *, ignore: Optional[chkbit.Ignore] = None): + self.path = path + self.ignore = ignore diff --git a/chkbit/status.py b/chkbit/status.py index 30dfeb6..be9de0b 100644 --- a/chkbit/status.py +++ b/chkbit/status.py @@ -8,6 +8,6 @@ class Status(Enum): NEW = "new" UPDATE = "upd" OK = "ok " - SKIP = "skp" + IGNORE = "ign" INTERNALEXCEPTION = "EXC" UPDATE_INDEX = "iup" diff --git a/chkbit_cli/main.py b/chkbit_cli/main.py index 1e7b0bc..9dd2092 100644 --- a/chkbit_cli/main.py +++ b/chkbit_cli/main.py @@ -10,7 +10,13 @@ from chkbit import Context, Status, IndexThread from chkbit_cli import CLI, Progress, RateCalc, sparkify -STATUS_CODES = """ +EPILOG = """ +.chkbitignore rules: + each line should contain exactly one name + you may use Unix shell-style wildcards (see README) + lines starting with `#` are skipped + lines starting with `/` are only applied to the current directory + Status codes: DMG: error, data damage detected EIX: error, index damaged @@ -18,7 +24,7 @@ Status codes: new: new file upd: file updated ok : check ok - skp: skipped (see .chkbitignore) + ign: ignored (see .chkbitignore) EXC: internal exception """ @@ -67,7 +73,7 @@ class Main: elif stat == Status.NEW: self.num_new += 1 - if self.verbose or not stat in [Status.OK, Status.SKIP]: + if self.verbose or not stat in [Status.OK, Status.IGNORE]: CLI.printline(stat.value, " ", path) def _res_worker(self, context: Context): @@ -113,18 +119,15 @@ class Main: print(self.total, end="\r") def process(self, args): - # the input queue is used to distribute the work - # to the index threads - input_queue = queue.Queue() - - # put the initial paths into the queue - for path in args.paths: - input_queue.put(path) + if args.update and args.show_ignored_only: + print("Error: use either --update or --show-ignored-only!", file=sys.stderr) + return None context = Context( num_workers=args.workers, force=args.force, update=args.update, + show_ignored_only=args.show_ignored_only, hash_algo=args.algo, skip_symlinks=args.skip_symlinks, index_filename=args.index_name, @@ -132,10 +135,12 @@ class Main: ) self.result_queue = context.result_queue + # put the initial paths into the queue + for path in args.paths: + context.add_input(path) + # start indexing - workers = [ - IndexThread(i, context, input_queue) for i in range(context.num_workers) - ] + workers = [IndexThread(i, context) for i in range(context.num_workers)] # log the results from the workers res_worker = threading.Thread(target=self._res_worker, args=(context,)) @@ -143,11 +148,11 @@ class Main: res_worker.start() # wait for work to finish - input_queue.join() + context.input_queue.join() # signal workers to exit for worker in workers: - input_queue.put(None) + context.end_input() # signal res_worker to exit self.result_queue.put(None) @@ -231,7 +236,7 @@ class Main: parser = argparse.ArgumentParser( prog="chkbit", description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py", - epilog=STATUS_CODES, + epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -246,6 +251,10 @@ class Main: help="update indices (without this chkbit will verify files in readonly mode)", ) + parser.add_argument( + "--show-ignored-only", action="store_true", help="only show ignored files" + ) + parser.add_argument( "--algo", type=str, @@ -305,7 +314,7 @@ class Main: args = parser.parse_args() - self.verbose = args.verbose + self.verbose = args.verbose or args.show_ignored_only if args.quiet: self.progress = Progress.Quiet elif not sys.stdout.isatty(): @@ -315,7 +324,8 @@ class Main: if args.paths: context = self.process(args) - self.print_result(context) + if context and not context.show_ignored_only: + self.print_result(context) else: parser.print_help() diff --git a/pyproject.toml b/pyproject.toml index 6775ef3..f110284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "chkbit" -version = "3.0.2" +version = "4.0.0" description = "chkbit checks the data integrity of your files" authors = [ {name = "Christian Zangl", email = "laktak@cdak.net"}, @@ -8,7 +8,7 @@ authors = [ dependencies = [ "blake3>=0.3.4", ] -requires-python = ">=3.6.0" +requires-python = ">=3.7.0" readme = "README.md" license = {file = "LICENSE"}