1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

directory agg: add CacheDirectoryIndexSource to cache the per-directory file listing, rescanning a directory only if it has changed

This commit is contained in:
Ilya Kreymer 2016-03-19 20:34:09 -07:00
parent f5ee3c7bca
commit 4cf935abd1
5 changed files with 75 additions and 14 deletions

View File

@ -221,24 +221,53 @@ class BaseDirectoryIndexSource(BaseAggregator):
def _load_files(self, glob_dir): def _load_files(self, glob_dir):
for the_dir in glob.iglob(glob_dir): for the_dir in glob.iglob(glob_dir):
for name in os.listdir(the_dir): for result in self._load_files_single_dir(the_dir):
filename = os.path.join(the_dir, name) yield result
if filename.endswith(self.CDX_EXT): def _load_files_single_dir(self, the_dir):
print('Adding ' + filename) for name in os.listdir(the_dir):
rel_path = os.path.relpath(the_dir, self.base_prefix) filename = os.path.join(the_dir, name)
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
yield full_name, FileIndexSource(filename) if filename.endswith(self.CDX_EXT):
print('Adding ' + filename)
rel_path = os.path.relpath(the_dir, self.base_prefix)
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
yield full_name, FileIndexSource(filename)
def __str__(self): def __str__(self):
return 'file_dir' return 'file_dir'
#=============================================================================
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource): class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
pass pass
#=============================================================================
class CacheDirectoryIndexSource(DirectoryIndexSource):
    """Directory index source that caches the file listing of each directory.

    The listing produced by the parent ``_load_files_single_dir`` is kept
    per directory and reused until the directory appears to have changed.
    """

    def __init__(self, *args, **kwargs):
        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
        # Maps directory path -> (change signature, list of listing entries).
        self.cached_file_list = {}

    def _load_files_single_dir(self, the_dir):
        """Return the listing for ``the_dir``, rescanning only when needed.

        :param the_dir: path of the directory to list.
        :return: list of the entries yielded by the parent implementation.
        """
        try:
            dir_stat = os.stat(the_dir)
        except OSError:
            # Directory could not be stat'ed: never trust the cache and let
            # the parent scan report the underlying problem.
            signature = None
        else:
            # Compare only fields that identify the directory and its last
            # modification. A full stat_result comparison also includes
            # st_atime, which reading the directory may itself update, and
            # its tuple form only carries whole-second timestamps.
            signature = (dir_stat.st_dev, dir_stat.st_ino, dir_stat.st_mtime)

        cached = self.cached_file_list.get(the_dir)
        if signature is not None and cached is not None:
            last_signature, files = cached
            if last_signature == signature:
                print('Dir {0} unchanged'.format(the_dir))
                return files

        # Materialize the parent's generator so the result can be reused.
        files = list(
            super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
        )
        self.cached_file_list[the_dir] = (signature, files)
        return files

View File

@ -254,7 +254,7 @@ class LiveWebLoader(BaseLoader):
try: try:
fp = upstream_res.raw._fp.fp fp = upstream_res.raw._fp.fp
if hasattr(fp, 'raw'): if hasattr(fp, 'raw'): #pragma: no cover
fp = fp.raw fp = fp.raw
remote_ip = fp._sock.getpeername()[0] remote_ip = fp._sock.getpeername()[0]
except: #pragma: no cover except: #pragma: no cover

View File

@ -7,7 +7,10 @@ from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
from mock import patch from mock import patch
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator import time
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource from webagg.indexsource import MementoIndexSource
@ -44,6 +47,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
fh.write('foo') fh.write('foo')
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path) cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
def test_agg_no_coll_set(self): def test_agg_no_coll_set(self):
res, errs = self.dir_loader(dict(url='example.com/')) res, errs = self.dir_loader(dict(url='example.com/'))
@ -188,3 +192,25 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
    """Source list is stable across repeated calls and includes new files."""
    exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                       'colls/B/indexes/iana.cdxj': 'file',
                       'colls/C/indexes/dupes.cdxj': 'file'}
          }

    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
    assert(res == exp)

    # Repeating the same query must give the same listing.
    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
    assert(res == exp)

    new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')

    # Directory change detection is based on the directory's stat data,
    # which may only have whole-second resolution; wait so the new file is
    # guaranteed to register as a change.
    time.sleep(1.0)

    # Create the empty index file. Passing None explicitly keeps
    # os.utime() working on Python 2, where the argument is required.
    with open(new_file, 'a'):
        os.utime(new_file, None)

    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})

    # New File Included
    exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
    assert(res == exp)

View File

@ -36,7 +36,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
def test_live_paths(self): def test_live_paths(self):
res = requests.get(self.base_url + '/') res = requests.get(self.base_url + '/')
assert set(res.json().keys()) == {'/live/postreq', '/live'} assert set(res.json().keys()) == {'/live/postreq', '/live', '/replay/postreq', '/replay'}
def test_upstream_paths(self): def test_upstream_paths(self):
res = self.testapp.get('/') res = self.testapp.get('/')

View File

@ -7,7 +7,7 @@ from multiprocessing import Process
from wsgiref.simple_server import make_server from wsgiref.simple_server import make_server
from webagg.aggregator import SimpleAggregator from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
from webagg.app import ResAggApp from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler from webagg.handlers import DefaultResourceHandler
from webagg.indexsource import LiveIndexSource from webagg.indexsource import LiveIndexSource
@ -66,6 +66,12 @@ class LiveServerTests(object):
{'live': LiveIndexSource()}) {'live': LiveIndexSource()})
) )
) )
app.add_route('/replay',
DefaultResourceHandler(SimpleAggregator(
{'replay': CacheDirectoryIndexSource('./testdata/')}),
'./testdata/'
)
)
return app.application return app.application
@classmethod @classmethod