.chkbitignore support for subdirectories #8

This commit is contained in:
Christian Zangl 2024-01-03 18:33:36 +01:00
parent 0f55a94658
commit df44bc7bf1
No known key found for this signature in database
GPG Key ID: 6D468AC36E2A4B3D
11 changed files with 183 additions and 81 deletions

View File

@ -74,7 +74,7 @@ chkbit will
Run `chkbit PATH` to verify only.
```
usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...]
usage: chkbit [-h] [-u] [--show-ignored-only] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...]
Checks the data integrity of your files. See https://github.com/laktak/chkbit-py
@ -84,6 +84,7 @@ positional arguments:
options:
-h, --help show this help message and exit
-u, --update update indices (without this chkbit will verify files in readonly mode)
--show-ignored-only only show ignored files
--algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3)
-f, --force force update of damaged items
-s, --skip-symlinks do not follow symlinks
@ -94,6 +95,12 @@ options:
-q, --quiet quiet, don't show progress/information
-v, --verbose verbose output
.chkbitignore rules:
each line should contain exactly one name
you may use Unix shell-style wildcards (see README)
lines starting with `#` are skipped
lines starting with `/` are only applied to the current directory
Status codes:
DMG: error, data damage detected
EIX: error, index damaged
@ -101,7 +108,7 @@ Status codes:
new: new file
upd: file updated
ok : check ok
skp: skipped (see .chkbitignore)
ign: ignored (see .chkbitignore)
EXC: internal exception
```
@ -123,9 +130,13 @@ You should
Add a `.chkbitignore` file containing the names of the files/directories you wish to ignore
- each line should contain exactly one name
- you may use [Unix shell-style wildcards](https://docs.python.org/3/library/fnmatch.html)
- `*` matches everything
- `?` matches any single character
- `[seq]` matches any character in seq
- `[!seq]` matches any character not in seq
- lines starting with `#` are skipped
- you may use [Unix shell-style wildcards](https://docs.python.org/3.8/library/fnmatch.html)
- at the moment does not allow to match files in subdirectories (PR welcome)
- lines starting with `/` apply only to the directory that contains the `.chkbitignore` file; all other lines also apply to its subdirectories
## FAQ

View File

@ -1,4 +1,6 @@
from chkbit.status import Status
from chkbit.ignore import Ignore
from chkbit.input_item import InputItem
from chkbit.context import Context
from chkbit.hashfile import hashfile, hashtext
from chkbit.index import Index

View File

@ -1,5 +1,8 @@
from __future__ import annotations
import queue
from chkbit import Status
import chkbit
from typing import Optional
from chkbit import InputItem
class Context:
@ -9,6 +12,7 @@ class Context:
num_workers=5,
force=False,
update=False,
show_ignored_only=False,
hash_algo="blake3",
skip_symlinks=False,
index_filename=".chkbit",
@ -17,19 +21,30 @@ class Context:
self.num_workers = num_workers
self.force = force
self.update = update
self.show_ignored_only = show_ignored_only
self.hash_algo = hash_algo
self.skip_symlinks = skip_symlinks
self.index_filename = index_filename
self.ignore_filename = ignore_filename
# the input queue is used to distribute the work
# to the index threads
self.input_queue = queue.Queue()
self.result_queue = queue.Queue()
self.hit_queue = queue.Queue()
if hash_algo not in ["md5", "sha512", "blake3"]:
raise Exception(f"{hash_algo} is unknown.")
def log(self, stat: Status, path: str):
def log(self, stat: chkbit.Status, path: str):
self.result_queue.put((0, stat, path))
def hit(self, *, cfiles: int = 0, cbytes: int = 0):
self.result_queue.put((1, cfiles, cbytes))
def add_input(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
self.input_queue.put(InputItem(path, ignore=ignore))
def end_input(self):
self.input_queue.put(None)

View File

@ -29,7 +29,7 @@ def hashfile(path: str, hash_algo: str, *, hit: Callable[[str], None]):
return h.hexdigest()
def hashtext(text):
def hashtext(text: str):
    """Return the hexadecimal MD5 digest of `text` encoded as UTF-8."""
    return hashlib.md5(text.encode("utf-8")).hexdigest()

56
chkbit/ignore.py Normal file
View File

@ -0,0 +1,56 @@
from __future__ import annotations
import fnmatch
import os
import sys
import chkbit
from enum import Enum
from typing import Optional
class Ignore:
    """Ignore rules of one directory, chained to the rules of its parents.

    The rules are read from the ignore file (``context.ignore_filename``)
    inside ``path``. Each rule is an ``fnmatch`` pattern. A rule starting
    with ``/`` is only applied to entries directly inside ``path``; every
    other rule is also consulted for entries of subdirectories, via the
    ``parent_ignore`` chain.
    """

    def __init__(
        self,
        context: chkbit.Context,
        path: str,
        *,
        parent_ignore: Optional[chkbit.Ignore],
    ):
        """Load the rules for `path`.

        `parent_ignore` is the rule set of the parent directory, or None
        when `path` has no parent rules to inherit.
        """
        self.parent_ignore = parent_ignore
        self.context = context
        self.path = path
        # directory name with a trailing slash; it is prepended to the
        # entry name when a lookup is delegated to the parent rules
        self.name = os.path.basename(path) + "/"
        self.ignore = []
        self.load_ignore()

    @property
    def ignore_filepath(self):
        """Path of the ignore file inside this directory."""
        return os.path.join(self.path, self.context.ignore_filename)

    def load_ignore(self):
        """Read the ignore file, if present, into ``self.ignore``.

        Surrounding whitespace is stripped from every line so that a
        stray trailing blank does not silently disable a rule. Empty
        lines and lines whose first non-blank character is ``#`` are
        skipped. ``self.ignore`` is left untouched when no file exists.
        """
        if not os.path.exists(self.ignore_filepath):
            return
        with open(self.ignore_filepath, "r", encoding="utf-8") as f:
            text = f.read()
        stripped = (line.strip() for line in text.splitlines())
        self.ignore = [
            line for line in stripped if line and not line.startswith("#")
        ]

    def should_ignore(self, name: str, *, fullname: Optional[str] = None):
        """Return True if `name` matches a rule of this directory or a parent.

        `name` is the entry name as seen from the directory that started
        the lookup. `fullname` is only set when the call was delegated by
        a subdirectory; it is the entry's path relative to this directory.
        """
        for pattern in self.ignore:
            if pattern.startswith("/"):
                if fullname:
                    # rooted rules never apply to entries of subdirectories
                    continue
                pattern = pattern[1:]
            if fnmatch.fnmatch(name, pattern):
                return True
            if fullname and fnmatch.fnmatch(fullname, pattern):
                return True
        if self.parent_ignore:
            # ask the parent, extending the relative path by our own name
            return self.parent_ignore.should_ignore(
                fullname or name, fullname=self.name + (fullname or name)
            )
        return False

View File

@ -1,40 +1,38 @@
from __future__ import annotations
import fnmatch
import os
import subprocess
import sys
import json
import chkbit
from chkbit import hashfile, hashtext, Status
from typing import Optional
VERSION = 2 # index version
class Index:
def __init__(self, context, path, files, *, readonly=False):
def __init__(
self,
context: chkbit.Context,
path: str,
files: list[str],
*,
readonly: bool = False,
):
self.context = context
self.path = path
self.files = files
self.old = {}
self.new = {}
self.ignore = []
self.load_ignore()
self.updates = []
self.modified = None
self.readonly = readonly
@property
def ignore_filepath(self):
return os.path.join(self.path, self.context.ignore_filename)
@property
def index_filepath(self):
return os.path.join(self.path, self.context.index_filename)
def should_ignore(self, name):
for ignore in self.ignore:
if fnmatch.fnmatch(name, ignore):
return True
return False
def _setmod(self, value=True):
self.modified = value
@ -42,10 +40,10 @@ class Index:
self.context.log(stat, os.path.join(self.path, name))
# calc new hashes for this index
def update(self):
def calc_hashes(self, *, ignore: Optional[chkbit.Ignore] = None):
for name in self.files:
if self.should_ignore(name):
self._log(Status.SKIP, name)
if ignore and ignore.should_ignore(name):
self._log(Status.IGNORE, name)
continue
a = self.context.hash_algo
@ -65,8 +63,13 @@ class Index:
else:
self.new[name] = self._calc_file(name, a)
def show_ignored_only(self, ignore: chkbit.Ignore):
for name in self.files:
if ignore.should_ignore(name):
self._log(Status.IGNORE, name)
# check/update the index (old vs new)
def check_fix(self, force):
def check_fix(self, force: bool):
for name in self.new.keys():
if not name in self.old:
self._log(Status.NEW, name)
@ -101,7 +104,7 @@ class Index:
self._log(Status.WARN_OLD, name)
self._setmod()
def _list_file(self, name, a):
def _list_file(self, name: str, a: str):
# produce a dummy entry for new files when the index is not updated
return {
"mod": None,
@ -109,7 +112,7 @@ class Index:
"h": None,
}
def _calc_file(self, name, a):
def _calc_file(self, name: str, a: str):
path = os.path.join(self.path, name)
info = os.stat(path)
mtime = int(info.st_mtime * 1000)
@ -158,15 +161,3 @@ class Index:
self._setmod()
self._log(Status.ERR_IDX, self.index_filepath)
return True
def load_ignore(self):
if not os.path.exists(self.ignore_filepath):
return
with open(self.ignore_filepath, "r", encoding="utf-8") as f:
text = f.read()
self.ignore = list(
filter(
lambda x: x and x[0] != "#" and len(x.strip()) > 0, text.splitlines()
)
)

View File

@ -1,27 +1,29 @@
from __future__ import annotations
import os
import sys
import time
import threading
from chkbit import Index, Status
import chkbit
from chkbit import Index, Status, Ignore
class IndexThread:
def __init__(self, thread_no, context, input_queue):
def __init__(self, thread_no: int, context: chkbit.Context):
self.thread_no = thread_no
self.update = context.update
self.context = context
self.input_queue = input_queue
self.input_queue = context.input_queue
self.t = threading.Thread(target=self._run)
self.t.daemon = True
self.t.start()
def _process_root(self, parent):
def _process_root(self, iitem: chkbit.InputItem):
files = []
dirs = []
# load files and subdirs
for name in os.listdir(path=parent):
path = os.path.join(parent, name)
for name in os.listdir(path=iitem.path):
path = os.path.join(iitem.path, name)
if name[0] == ".":
continue
if os.path.isdir(path):
@ -33,11 +35,17 @@ class IndexThread:
files.append(name)
# load index
index = Index(self.context, parent, files, readonly=not self.update)
index = Index(self.context, iitem.path, files, readonly=not self.update)
index.load()
# load ignore
ignore = Ignore(self.context, iitem.path, parent_ignore=iitem.ignore)
if self.context.show_ignored_only:
index.show_ignored_only(ignore)
else:
# calc the new hashes
index.update()
index.calc_hashes(ignore=ignore)
# compare
index.check_fix(self.context.force)
@ -49,20 +57,20 @@ class IndexThread:
# process subdirs
for name in dirs:
if not index.should_ignore(name):
self.input_queue.put(os.path.join(parent, name))
if not ignore.should_ignore(name):
self.context.add_input(os.path.join(iitem.path, name), ignore=ignore)
else:
self.context.log(Status.SKIP, name + "/")
self.context.log(Status.IGNORE, name + "/")
def _run(self):
while True:
parent = self.input_queue.get()
if parent is None:
iitem = self.input_queue.get()
if iitem is None:
break
try:
self._process_root(parent)
self._process_root(iitem)
except Exception as e:
self.context.log(Status.INTERNALEXCEPTION, f"{parent}: {e}")
self.context.log(Status.INTERNALEXCEPTION, f"{iitem.path}: {e}")
self.input_queue.task_done()
def join(self):

9
chkbit/input_item.py Normal file
View File

@ -0,0 +1,9 @@
from __future__ import annotations
from typing import Optional
import chkbit
class InputItem:
    """One entry of the input queue: a directory path plus inherited ignore rules."""

    def __init__(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
        # `ignore` carries the rules of the parent directory (None if there are none)
        self.path, self.ignore = path, ignore

View File

@ -8,6 +8,6 @@ class Status(Enum):
NEW = "new"
UPDATE = "upd"
OK = "ok "
SKIP = "skp"
IGNORE = "ign"
INTERNALEXCEPTION = "EXC"
UPDATE_INDEX = "iup"

View File

@ -10,7 +10,13 @@ from chkbit import Context, Status, IndexThread
from chkbit_cli import CLI, Progress, RateCalc, sparkify
STATUS_CODES = """
EPILOG = """
.chkbitignore rules:
each line should contain exactly one name
you may use Unix shell-style wildcards (see README)
lines starting with `#` are skipped
lines starting with `/` are only applied to the current directory
Status codes:
DMG: error, data damage detected
EIX: error, index damaged
@ -18,7 +24,7 @@ Status codes:
new: new file
upd: file updated
ok : check ok
skp: skipped (see .chkbitignore)
ign: ignored (see .chkbitignore)
EXC: internal exception
"""
@ -67,7 +73,7 @@ class Main:
elif stat == Status.NEW:
self.num_new += 1
if self.verbose or not stat in [Status.OK, Status.SKIP]:
if self.verbose or not stat in [Status.OK, Status.IGNORE]:
CLI.printline(stat.value, " ", path)
def _res_worker(self, context: Context):
@ -113,18 +119,15 @@ class Main:
print(self.total, end="\r")
def process(self, args):
# the input queue is used to distribute the work
# to the index threads
input_queue = queue.Queue()
# put the initial paths into the queue
for path in args.paths:
input_queue.put(path)
if args.update and args.show_ignored_only:
print("Error: use either --update or --show-ignored-only!", file=sys.stderr)
return None
context = Context(
num_workers=args.workers,
force=args.force,
update=args.update,
show_ignored_only=args.show_ignored_only,
hash_algo=args.algo,
skip_symlinks=args.skip_symlinks,
index_filename=args.index_name,
@ -132,10 +135,12 @@ class Main:
)
self.result_queue = context.result_queue
# put the initial paths into the queue
for path in args.paths:
context.add_input(path)
# start indexing
workers = [
IndexThread(i, context, input_queue) for i in range(context.num_workers)
]
workers = [IndexThread(i, context) for i in range(context.num_workers)]
# log the results from the workers
res_worker = threading.Thread(target=self._res_worker, args=(context,))
@ -143,11 +148,11 @@ class Main:
res_worker.start()
# wait for work to finish
input_queue.join()
context.input_queue.join()
# signal workers to exit
for worker in workers:
input_queue.put(None)
context.end_input()
# signal res_worker to exit
self.result_queue.put(None)
@ -231,7 +236,7 @@ class Main:
parser = argparse.ArgumentParser(
prog="chkbit",
description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py",
epilog=STATUS_CODES,
epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
@ -246,6 +251,10 @@ class Main:
help="update indices (without this chkbit will verify files in readonly mode)",
)
parser.add_argument(
"--show-ignored-only", action="store_true", help="only show ignored files"
)
parser.add_argument(
"--algo",
type=str,
@ -305,7 +314,7 @@ class Main:
args = parser.parse_args()
self.verbose = args.verbose
self.verbose = args.verbose or args.show_ignored_only
if args.quiet:
self.progress = Progress.Quiet
elif not sys.stdout.isatty():
@ -315,6 +324,7 @@ class Main:
if args.paths:
context = self.process(args)
if context and not context.show_ignored_only:
self.print_result(context)
else:
parser.print_help()

View File

@ -1,6 +1,6 @@
[project]
name = "chkbit"
version = "3.0.2"
version = "4.0.0"
description = "chkbit checks the data integrity of your files"
authors = [
{name = "Christian Zangl", email = "laktak@cdak.net"},
@ -8,7 +8,7 @@ authors = [
dependencies = [
"blake3>=0.3.4",
]
requires-python = ">=3.6.0"
requires-python = ">=3.7.0"
readme = "README.md"
license = {file = "LICENSE"}