mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
directory agg: add CacheDirectoryIndexSource to cache file listing, rescan dir only if changed
This commit is contained in:
parent
f5ee3c7bca
commit
4cf935abd1
@ -221,24 +221,53 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
||||
|
||||
def _load_files(self, glob_dir):
|
||||
for the_dir in glob.iglob(glob_dir):
|
||||
for name in os.listdir(the_dir):
|
||||
filename = os.path.join(the_dir, name)
|
||||
for result in self._load_files_single_dir(the_dir):
|
||||
yield result
|
||||
|
||||
if filename.endswith(self.CDX_EXT):
|
||||
print('Adding ' + filename)
|
||||
rel_path = os.path.relpath(the_dir, self.base_prefix)
|
||||
if rel_path == '.':
|
||||
full_name = name
|
||||
else:
|
||||
full_name = rel_path + '/' + name
|
||||
def _load_files_single_dir(self, the_dir):
|
||||
for name in os.listdir(the_dir):
|
||||
filename = os.path.join(the_dir, name)
|
||||
|
||||
yield full_name, FileIndexSource(filename)
|
||||
if filename.endswith(self.CDX_EXT):
|
||||
print('Adding ' + filename)
|
||||
rel_path = os.path.relpath(the_dir, self.base_prefix)
|
||||
if rel_path == '.':
|
||||
full_name = name
|
||||
else:
|
||||
full_name = rel_path + '/' + name
|
||||
|
||||
yield full_name, FileIndexSource(filename)
|
||||
|
||||
def __str__(self):
|
||||
return 'file_dir'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
    """Directory index source built from SeqAggMixin and BaseDirectoryIndexSource.

    Declares no attributes or methods of its own; all behavior comes from
    the two base classes.
    """
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class CacheDirectoryIndexSource(DirectoryIndexSource):
    """Directory index source that remembers each directory's file listing.

    The listing produced by the parent class for a directory is stored in
    ``cached_file_list`` together with a change key derived from
    ``os.stat()`` of that directory.  A later call for the same directory
    returns the stored listing while the key is unchanged, and rescans the
    directory otherwise.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and start with an empty cache."""
        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)

        # Maps directory path -> (change key, list of listing results).
        self.cached_file_list = {}

    @staticmethod
    def _dir_change_key(the_dir):
        """Return a value that changes when the entries of ``the_dir`` change.

        Returns ``None`` when the directory cannot be stat'ed, which callers
        treat as "never reuse the cache".
        """
        try:
            st = os.stat(the_dir)
        except OSError:
            return None

        # Deliberately not the whole stat result:
        #  * it includes st_atime, which reading the directory may itself
        #    update, causing needless rescans;
        #  * compared as a sequence it only carries whole-second timestamps,
        #    so a change within the same second as the last scan is missed.
        # The attribute timestamps keep sub-second precision where the
        # platform provides it.
        return (st.st_dev, st.st_ino, st.st_mtime, st.st_ctime)

    def _load_files_single_dir(self, the_dir):
        """Return the listing for ``the_dir``, rescanning only if it changed.

        :param the_dir: path of the directory to list
        :returns: list of the results produced by the parent class'
                  ``_load_files_single_dir`` for this directory
        """
        # Take the key *before* listing: if the directory changes while it is
        # being scanned, the stored key is already stale and the next call
        # rescans.
        key = self._dir_change_key(the_dir)

        cached = self.cached_file_list.get(the_dir)

        if cached is not None:
            last_key, files = cached
            if key is not None and last_key == key:
                print('Dir {0} unchanged'.format(the_dir))
                return files

        # Materialize the parent's results so they can be returned again on
        # later calls.
        parent = super(CacheDirectoryIndexSource, self)
        files = list(parent._load_files_single_dir(the_dir))

        self.cached_file_list[the_dir] = (key, files)
        return files
|
||||
|
@ -254,7 +254,7 @@ class LiveWebLoader(BaseLoader):
|
||||
|
||||
try:
|
||||
fp = upstream_res.raw._fp.fp
|
||||
if hasattr(fp, 'raw'):
|
||||
if hasattr(fp, 'raw'): #pragma: no cover
|
||||
fp = fp.raw
|
||||
remote_ip = fp._sock.getpeername()[0]
|
||||
except: #pragma: no cover
|
||||
|
@ -7,7 +7,10 @@ from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
|
||||
|
||||
from mock import patch
|
||||
|
||||
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
|
||||
import time
|
||||
|
||||
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from webagg.indexsource import MementoIndexSource
|
||||
|
||||
|
||||
@ -44,6 +47,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
fh.write('foo')
|
||||
|
||||
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
||||
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
|
||||
|
||||
def test_agg_no_coll_set(self):
|
||||
res, errs = self.dir_loader(dict(url='example.com/'))
|
||||
@ -188,3 +192,25 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
|
||||
|
||||
def test_cache_dir_sources_1(self):
    """Cached loader lists the same sources twice, then includes a new file."""
    exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                       'colls/B/indexes/iana.cdxj': 'file',
                       'colls/C/indexes/dupes.cdxj': 'file'}
          }

    # First call populates the cache.
    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
    assert res == exp

    # Second call with no directory change must give the same listing.
    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
    assert res == exp

    new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')

    # Create the (empty) file.  ``times`` is passed explicitly as None because
    # it is a required argument of os.utime() on Python 2.
    with open(new_file, 'a'):
        os.utime(new_file, None)

    res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})

    # New File Included
    exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
    assert res == exp
|
||||
|
@ -36,7 +36,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
|
||||
|
||||
def test_live_paths(self):
|
||||
res = requests.get(self.base_url + '/')
|
||||
assert set(res.json().keys()) == {'/live/postreq', '/live'}
|
||||
assert set(res.json().keys()) == {'/live/postreq', '/live', '/replay/postreq', '/replay'}
|
||||
|
||||
def test_upstream_paths(self):
|
||||
res = self.testapp.get('/')
|
||||
|
@ -7,7 +7,7 @@ from multiprocessing import Process
|
||||
|
||||
from wsgiref.simple_server import make_server
|
||||
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
|
||||
from webagg.app import ResAggApp
|
||||
from webagg.handlers import DefaultResourceHandler
|
||||
from webagg.indexsource import LiveIndexSource
|
||||
@ -66,6 +66,12 @@ class LiveServerTests(object):
|
||||
{'live': LiveIndexSource()})
|
||||
)
|
||||
)
|
||||
app.add_route('/replay',
|
||||
DefaultResourceHandler(SimpleAggregator(
|
||||
{'replay': CacheDirectoryIndexSource('./testdata/')}),
|
||||
'./testdata/'
|
||||
)
|
||||
)
|
||||
return app.application
|
||||
|
||||
@classmethod
|
||||
|
Loading…
x
Reference in New Issue
Block a user