1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 00:25:21 +01:00

refactor autoindex: switch to gevent-based simple polling, as watchdog doesn't work with gevent #200

This commit is contained in:
Ilya Kreymer 2016-11-11 10:30:48 -08:00
parent fa247b8fe5
commit ab77c1b6d9
3 changed files with 67 additions and 43 deletions

View File

@@ -1,45 +1,60 @@
import sys import gevent
import time import time
from watchdog.observers import Observer import re
from watchdog.events import RegexMatchingEventHandler import os
#============================================================================= #=============================================================================
EXT_REGEX = '.*\.w?arc(\.gz)?$' EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
keep_running = True keep_running = True
#============================================================================= #=============================================================================
class CDXAutoIndexer(RegexMatchingEventHandler): class CDXAutoIndexer(object):
def __init__(self, updater, path): def __init__(self, updater, path):
super(CDXAutoIndexer, self).__init__(regexes=[EXT_REGEX],
ignore_directories=True)
self.updater = updater self.updater = updater
self.cdx_path = path self.root_path = path
def on_created(self, event): self.mtimes = {}
self.updater(event.src_path)
def on_modified(self, event): def has_changed(self, *paths):
self.updater(event.src_path) full_path = os.path.join(*paths)
try:
mtime = os.path.getmtime(full_path)
except:
return False
def start_watch(self): if mtime == self.mtimes.get(full_path):
self.observer = Observer() return False
self.observer.schedule(self, self.cdx_path, recursive=True)
self.observer.start()
def do_loop(self, sleep_time=1): self.mtimes[full_path] = mtime
return full_path
def check_path(self):
for dirName, subdirList, fileList in os.walk(self.root_path):
if not subdirList and not self.has_changed(dirName):
return False
for filename in fileList:
if not EXT_RX.match(filename):
continue
path = self.has_changed(self.root_path, dirName, filename)
if not path:
continue
self.updater(os.path.join(dirName, filename))
def do_loop(self, interval):
try: try:
while keep_running: while keep_running:
time.sleep(sleep_time) self.check_path()
time.sleep(interval)
except KeyboardInterrupt: # pragma: no cover except KeyboardInterrupt: # pragma: no cover
self.observer.stop() return
self.observer.join()
def start(self, interval):
self.ge = gevent.spawn(self.do_loop, interval)
#=============================================================================
if __name__ == "__main__":
w = Watcher(sys.argv[1] if len(sys.argv) > 1 else '.')
def p(x):
print(x)
w.run(p)

View File

@@ -327,7 +327,7 @@ directory structure expected by pywb
migrate.convert_to_cdxj() migrate.convert_to_cdxj()
def autoindex(self, do_loop=True): def autoindex(self, interval=30.0, do_loop=True):
from pywb.manager.autoindex import CDXAutoIndexer from pywb.manager.autoindex import CDXAutoIndexer
if self.coll_name: if self.coll_name:
@@ -351,9 +351,10 @@ directory structure expected by pywb
indexer = CDXAutoIndexer(do_index, path) indexer = CDXAutoIndexer(do_index, path)
indexer.start_watch() indexer.start(interval)
#indexer.start_watch()
if do_loop: if do_loop:
indexer.do_loop() indexer.do_loop(interval)
#============================================================================= #=============================================================================
@@ -468,11 +469,12 @@ Create manage file based web archive collections
# Auto Index # Auto Index
def do_autoindex(r): def do_autoindex(r):
m = CollectionsManager(r.coll_name, must_exist=False) m = CollectionsManager(r.coll_name, must_exist=False)
m.autoindex(True) m.autoindex(r.interval, True)
autoindex_help = 'Automatically index any change archive files' autoindex_help = 'Automatically index any change archive files'
autoindex = subparsers.add_parser('autoindex', help=autoindex_help) autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
autoindex.add_argument('coll_name', nargs='?', default='') autoindex.add_argument('coll_name', nargs='?', default='')
autoindex.add_argument('--interval', type=float, default=30.0)
autoindex.set_defaults(func=do_autoindex) autoindex.set_defaults(func=do_autoindex)
r = parser.parse_args(args=args) r = parser.parse_args(args=args)

View File

@@ -1,3 +1,5 @@
from gevent.monkey import patch_all; patch_all()
import os import os
import tempfile import tempfile
import shutil import shutil
@@ -6,7 +8,8 @@ import sys
import webtest import webtest
import time import time
import threading #import threading
import gevent
from six import StringIO from six import StringIO
@@ -488,20 +491,22 @@ class TestManagedColls(object):
def do_copy(): def do_copy():
try: try:
time.sleep(1) time.sleep(1.0)
shutil.copy(self._get_sample_warc('example.warc.gz'), archive_dir) shutil.copy(self._get_sample_warc('example.warc.gz'), archive_dir)
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir) shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
time.sleep(1) time.sleep(1.0)
finally: finally:
pywb.manager.autoindex.keep_running = False pywb.manager.autoindex.keep_running = False
thread = threading.Thread(target=do_copy) #thread = threading.Thread(target=do_copy)
thread.daemon = True #thread.daemon = True
thread.start() #thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex']) main(['autoindex', 'auto', '--interval', '0.25'])
thread.join() #thread.join()
ge.join()
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE) index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
assert os.path.isfile(index_file) assert os.path.isfile(index_file)
@@ -519,13 +524,15 @@ class TestManagedColls(object):
os.remove(index_file) os.remove(index_file)
thread = threading.Thread(target=do_copy) #thread = threading.Thread(target=do_copy)
thread.daemon = True #thread.daemon = True
thread.start() #thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex', 'auto']) main(['autoindex', 'auto', '--interval', '0.25'])
thread.join() #thread.join()
ge.join()
# assert file was update # assert file was update
assert os.path.getmtime(index_file) > mtime assert os.path.getmtime(index_file) > mtime