From ab77c1b6d93ac62c54f3e3bdd013848faaf0a300 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 11 Nov 2016 10:30:48 -0800 Subject: [PATCH] refactor autoindex: switch to gevent-based simple polling, as watchdog doesn't work with gevent #200 --- pywb/manager/autoindex.py | 67 ++++++++++++++++++++++++--------------- pywb/manager/manager.py | 10 +++--- tests/test_auto_colls.py | 33 +++++++++++-------- 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/pywb/manager/autoindex.py b/pywb/manager/autoindex.py index 75052411..6949211a 100644 --- a/pywb/manager/autoindex.py +++ b/pywb/manager/autoindex.py @@ -1,45 +1,60 @@ -import sys +import gevent import time -from watchdog.observers import Observer -from watchdog.events import RegexMatchingEventHandler +import re +import os #============================================================================= -EXT_REGEX = '.*\.w?arc(\.gz)?$' +EXT_RX = re.compile('.*\.w?arc(\.gz)?$') keep_running = True + #============================================================================= -class CDXAutoIndexer(RegexMatchingEventHandler): +class CDXAutoIndexer(object): def __init__(self, updater, path): - super(CDXAutoIndexer, self).__init__(regexes=[EXT_REGEX], - ignore_directories=True) self.updater = updater - self.cdx_path = path + self.root_path = path - def on_created(self, event): - self.updater(event.src_path) + self.mtimes = {} - def on_modified(self, event): - self.updater(event.src_path) + def has_changed(self, *paths): + full_path = os.path.join(*paths) + try: + mtime = os.path.getmtime(full_path) + except: + return False - def start_watch(self): - self.observer = Observer() - self.observer.schedule(self, self.cdx_path, recursive=True) - self.observer.start() + if mtime == self.mtimes.get(full_path): + return False - def do_loop(self, sleep_time=1): + self.mtimes[full_path] = mtime + return full_path + + def check_path(self): + for dirName, subdirList, fileList in os.walk(self.root_path): + if not subdirList and not self.has_changed(dirName): + return False + + for filename in fileList: + if not EXT_RX.match(filename): + continue + + path = self.has_changed(self.root_path, dirName, filename) + if not path: + continue + + self.updater(os.path.join(dirName, filename)) + + def do_loop(self, interval): try: while keep_running: - time.sleep(sleep_time) + self.check_path() + time.sleep(interval) except KeyboardInterrupt: # pragma: no cover - self.observer.stop() - self.observer.join() + return + + def start(self, interval): + self.ge = gevent.spawn(self.do_loop, interval) -#============================================================================= -if __name__ == "__main__": - w = Watcher(sys.argv[1] if len(sys.argv) > 1 else '.') - def p(x): - print(x) - w.run(p) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 288b0475..54722d0c 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -327,7 +327,7 @@ directory structure expected by pywb migrate.convert_to_cdxj() - def autoindex(self, do_loop=True): + def autoindex(self, interval=30.0, do_loop=True): from pywb.manager.autoindex import CDXAutoIndexer if self.coll_name: @@ -351,9 +351,10 @@ directory structure expected by pywb indexer = CDXAutoIndexer(do_index, path) - indexer.start_watch() + indexer.start(interval) + #indexer.start_watch() if do_loop: - indexer.do_loop() + indexer.do_loop(interval) #============================================================================= @@ -468,11 +469,12 @@ Create manage file based web archive collections # Auto Index def do_autoindex(r): m = CollectionsManager(r.coll_name, must_exist=False) - m.autoindex(True) + m.autoindex(r.interval, True) autoindex_help = 'Automatically index any change archive files' autoindex = subparsers.add_parser('autoindex', help=autoindex_help) autoindex.add_argument('coll_name', nargs='?', default='') + autoindex.add_argument('--interval', type=float, default=30.0) autoindex.set_defaults(func=do_autoindex) r = parser.parse_args(args=args) diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 0f502453..ff88d947 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -1,3 +1,5 @@ +from gevent.monkey import patch_all; patch_all() + import os import tempfile import shutil @@ -6,7 +8,8 @@ import sys import webtest import time -import threading +#import threading +import gevent from six import StringIO @@ -488,20 +491,22 @@ class TestManagedColls(object): def do_copy(): try: - time.sleep(1) + time.sleep(1.0) shutil.copy(self._get_sample_warc('example.warc.gz'), archive_dir) shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir) - time.sleep(1) + time.sleep(1.0) finally: pywb.manager.autoindex.keep_running = False - thread = threading.Thread(target=do_copy) - thread.daemon = True - thread.start() + #thread = threading.Thread(target=do_copy) + #thread.daemon = True + #thread.start() + ge = gevent.spawn(do_copy) - main(['autoindex']) + main(['autoindex', 'auto', '--interval', '0.25']) - thread.join() + #thread.join() + ge.join() index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE) assert os.path.isfile(index_file) @@ -519,13 +524,15 @@ class TestManagedColls(object): os.remove(index_file) - thread = threading.Thread(target=do_copy) - thread.daemon = True - thread.start() + #thread = threading.Thread(target=do_copy) + #thread.daemon = True + #thread.start() + ge = gevent.spawn(do_copy) - main(['autoindex', 'auto']) + main(['autoindex', 'auto', '--interval', '0.25']) - thread.join() + #thread.join() + ge.join() # assert file was update assert os.path.getmtime(index_file) > mtime