1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor autoindex: switch to gevent-based simple polling, as watchdog doesn't work with gevent #200

This commit is contained in:
Ilya Kreymer 2016-11-11 10:30:48 -08:00
parent fa247b8fe5
commit ab77c1b6d9
3 changed files with 67 additions and 43 deletions

View File

@ -1,45 +1,60 @@
import sys
import gevent
import time
from watchdog.observers import Observer
from watchdog.events import RegexMatchingEventHandler
import re
import os
#=============================================================================
EXT_REGEX = '.*\.w?arc(\.gz)?$'
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
keep_running = True
#=============================================================================
class CDXAutoIndexer(RegexMatchingEventHandler):
class CDXAutoIndexer(object):
def __init__(self, updater, path):
super(CDXAutoIndexer, self).__init__(regexes=[EXT_REGEX],
ignore_directories=True)
self.updater = updater
self.cdx_path = path
self.root_path = path
def on_created(self, event):
self.updater(event.src_path)
self.mtimes = {}
def on_modified(self, event):
self.updater(event.src_path)
def has_changed(self, *paths):
full_path = os.path.join(*paths)
try:
mtime = os.path.getmtime(full_path)
except:
return False
def start_watch(self):
self.observer = Observer()
self.observer.schedule(self, self.cdx_path, recursive=True)
self.observer.start()
if mtime == self.mtimes.get(full_path):
return False
def do_loop(self, sleep_time=1):
self.mtimes[full_path] = mtime
return full_path
def check_path(self):
for dirName, subdirList, fileList in os.walk(self.root_path):
if not subdirList and not self.has_changed(dirName):
return False
for filename in fileList:
if not EXT_RX.match(filename):
continue
path = self.has_changed(self.root_path, dirName, filename)
if not path:
continue
self.updater(os.path.join(dirName, filename))
def do_loop(self, interval):
try:
while keep_running:
time.sleep(sleep_time)
self.check_path()
time.sleep(interval)
except KeyboardInterrupt: # pragma: no cover
self.observer.stop()
self.observer.join()
return
def start(self, interval):
self.ge = gevent.spawn(self.do_loop, interval)
#=============================================================================
if __name__ == "__main__":
w = Watcher(sys.argv[1] if len(sys.argv) > 1 else '.')
def p(x):
print(x)
w.run(p)

View File

@ -327,7 +327,7 @@ directory structure expected by pywb
migrate.convert_to_cdxj()
def autoindex(self, do_loop=True):
def autoindex(self, interval=30.0, do_loop=True):
from pywb.manager.autoindex import CDXAutoIndexer
if self.coll_name:
@ -351,9 +351,10 @@ directory structure expected by pywb
indexer = CDXAutoIndexer(do_index, path)
indexer.start_watch()
indexer.start(interval)
#indexer.start_watch()
if do_loop:
indexer.do_loop()
indexer.do_loop(interval)
#=============================================================================
@ -468,11 +469,12 @@ Create manage file based web archive collections
# Auto Index
def do_autoindex(r):
m = CollectionsManager(r.coll_name, must_exist=False)
m.autoindex(True)
m.autoindex(r.interval, True)
autoindex_help = 'Automatically index any change archive files'
autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
autoindex.add_argument('coll_name', nargs='?', default='')
autoindex.add_argument('--interval', type=float, default=30.0)
autoindex.set_defaults(func=do_autoindex)
r = parser.parse_args(args=args)

View File

@ -1,3 +1,5 @@
from gevent.monkey import patch_all; patch_all()
import os
import tempfile
import shutil
@ -6,7 +8,8 @@ import sys
import webtest
import time
import threading
#import threading
import gevent
from six import StringIO
@ -488,20 +491,22 @@ class TestManagedColls(object):
def do_copy():
try:
time.sleep(1)
time.sleep(1.0)
shutil.copy(self._get_sample_warc('example.warc.gz'), archive_dir)
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
time.sleep(1)
time.sleep(1.0)
finally:
pywb.manager.autoindex.keep_running = False
thread = threading.Thread(target=do_copy)
thread.daemon = True
thread.start()
#thread = threading.Thread(target=do_copy)
#thread.daemon = True
#thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex'])
main(['autoindex', 'auto', '--interval', '0.25'])
thread.join()
#thread.join()
ge.join()
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
assert os.path.isfile(index_file)
@ -519,13 +524,15 @@ class TestManagedColls(object):
os.remove(index_file)
thread = threading.Thread(target=do_copy)
thread.daemon = True
thread.start()
#thread = threading.Thread(target=do_copy)
#thread.daemon = True
#thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex', 'auto'])
main(['autoindex', 'auto', '--interval', '0.25'])
thread.join()
#thread.join()
ge.join()
# assert file was updated
assert os.path.getmtime(index_file) > mtime