1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

directory agg: add CacheDirectoryIndexSource to cache the file listing of each directory, rescanning a dir only if it has changed

This commit is contained in:
Ilya Kreymer 2016-03-19 20:34:09 -07:00
parent f5ee3c7bca
commit 4cf935abd1
5 changed files with 75 additions and 14 deletions

View File

@ -221,24 +221,53 @@ class BaseDirectoryIndexSource(BaseAggregator):
def _load_files(self, glob_dir):
for the_dir in glob.iglob(glob_dir):
for name in os.listdir(the_dir):
filename = os.path.join(the_dir, name)
for result in self._load_files_single_dir(the_dir):
yield result
if filename.endswith(self.CDX_EXT):
print('Adding ' + filename)
rel_path = os.path.relpath(the_dir, self.base_prefix)
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
def _load_files_single_dir(self, the_dir):
for name in os.listdir(the_dir):
filename = os.path.join(the_dir, name)
yield full_name, FileIndexSource(filename)
if filename.endswith(self.CDX_EXT):
print('Adding ' + filename)
rel_path = os.path.relpath(the_dir, self.base_prefix)
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
yield full_name, FileIndexSource(filename)
def __str__(self):
return 'file_dir'
#=============================================================================
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
    """Directory index source combining SeqAggMixin with BaseDirectoryIndexSource.

    Defines no members of its own; all behavior comes from the two bases.
    """
#=============================================================================
class CacheDirectoryIndexSource(DirectoryIndexSource):
    """Directory index source that caches the file listing of each directory.

    A directory is rescanned only when its change signature (device, inode,
    modification time) differs from the one recorded at the previous scan.
    """

    def __init__(self, *args, **kwargs):
        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
        # Maps directory path -> (change signature, list of loaded items).
        self.cached_file_list = {}

    @staticmethod
    def _dir_signature(the_dir):
        """Return a value that changes when the_dir is modified, or None.

        Only the device, inode and modification time are used. Comparing the
        whole ``os.stat`` result would also compare the access time, which
        merely listing the directory can update, and would only see
        whole-second timestamps, so a quick successive change could be missed.

        :param the_dir: path of the directory to inspect
        :return: a ``(st_dev, st_ino, st_mtime)`` tuple, or None if the
                 directory could not be stat'ed
        """
        try:
            info = os.stat(the_dir)
        except OSError:
            return None

        return (info.st_dev, info.st_ino, info.st_mtime)

    def _load_files_single_dir(self, the_dir):
        """Return the items loaded from the_dir, reusing the cache if possible.

        :param the_dir: path of the directory to list
        :return: list of the items produced by the parent implementation
        """
        signature = self._dir_signature(the_dir)
        cached = self.cached_file_list.get(the_dir)

        # A missing signature (stat failed) is never treated as a cache hit.
        if cached is not None and signature is not None:
            last_signature, files = cached
            if last_signature == signature:
                print('Dir {0} unchanged'.format(the_dir))
                return files

        # The parent implementation is lazy; materialize it so that the
        # listing can be stored and iterated more than once.
        files = list(
            super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
        )
        self.cached_file_list[the_dir] = (signature, files)
        return files

View File

@ -254,7 +254,7 @@ class LiveWebLoader(BaseLoader):
try:
fp = upstream_res.raw._fp.fp
if hasattr(fp, 'raw'):
if hasattr(fp, 'raw'): #pragma: no cover
fp = fp.raw
remote_ip = fp._sock.getpeername()[0]
except: #pragma: no cover

View File

@ -7,7 +7,10 @@ from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
from mock import patch
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
import time
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource
@ -44,6 +47,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
fh.write('foo')
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
def test_agg_no_coll_set(self):
res, errs = self.dir_loader(dict(url='example.com/'))
@ -188,3 +192,25 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
    """Cached directory loader lists sources and picks up newly added files."""
    exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                       'colls/B/indexes/iana.cdxj': 'file',
                       'colls/C/indexes/dupes.cdxj': 'file'}
          }

    def list_sources():
        # Build a fresh params dict per call, exactly as a new request would.
        return self.cache_dir_loader.get_source_list(
            {'url': 'example.com/', 'param.coll': '*'})

    # First call scans the directories.
    assert list_sources() == exp

    # Second call must return the same listing.
    assert list_sources() == exp

    # Create an empty index file in one of the collections.
    new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')
    with open(new_file, 'a'):
        # Pass the times argument explicitly: Python 2 requires it, and
        # None means "set to the current time" on both Python 2 and 3.
        os.utime(new_file, None)

    res = list_sources()

    # New File Included
    exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
    assert res == exp

View File

@ -36,7 +36,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
def test_live_paths(self):
res = requests.get(self.base_url + '/')
assert set(res.json().keys()) == {'/live/postreq', '/live'}
assert set(res.json().keys()) == {'/live/postreq', '/live', '/replay/postreq', '/replay'}
def test_upstream_paths(self):
res = self.testapp.get('/')

View File

@ -7,7 +7,7 @@ from multiprocessing import Process
from wsgiref.simple_server import make_server
from webagg.aggregator import SimpleAggregator
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler
from webagg.indexsource import LiveIndexSource
@ -66,6 +66,12 @@ class LiveServerTests(object):
{'live': LiveIndexSource()})
)
)
app.add_route('/replay',
DefaultResourceHandler(SimpleAggregator(
{'replay': CacheDirectoryIndexSource('./testdata/')}),
'./testdata/'
)
)
return app.application
@classmethod