mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
directory agg: add CacheDirectoryIndexSource to cache the file listing; rescan a dir only if it has changed
This commit is contained in:
parent
f5ee3c7bca
commit
4cf935abd1
@ -221,24 +221,53 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
|||||||
|
|
||||||
def _load_files(self, glob_dir):
|
def _load_files(self, glob_dir):
|
||||||
for the_dir in glob.iglob(glob_dir):
|
for the_dir in glob.iglob(glob_dir):
|
||||||
for name in os.listdir(the_dir):
|
for result in self._load_files_single_dir(the_dir):
|
||||||
filename = os.path.join(the_dir, name)
|
yield result
|
||||||
|
|
||||||
if filename.endswith(self.CDX_EXT):
|
def _load_files_single_dir(self, the_dir):
|
||||||
print('Adding ' + filename)
|
for name in os.listdir(the_dir):
|
||||||
rel_path = os.path.relpath(the_dir, self.base_prefix)
|
filename = os.path.join(the_dir, name)
|
||||||
if rel_path == '.':
|
|
||||||
full_name = name
|
|
||||||
else:
|
|
||||||
full_name = rel_path + '/' + name
|
|
||||||
|
|
||||||
yield full_name, FileIndexSource(filename)
|
if filename.endswith(self.CDX_EXT):
|
||||||
|
print('Adding ' + filename)
|
||||||
|
rel_path = os.path.relpath(the_dir, self.base_prefix)
|
||||||
|
if rel_path == '.':
|
||||||
|
full_name = name
|
||||||
|
else:
|
||||||
|
full_name = rel_path + '/' + name
|
||||||
|
|
||||||
|
yield full_name, FileIndexSource(filename)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'file_dir'
|
return 'file_dir'
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
|
class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class CacheDirectoryIndexSource(DirectoryIndexSource):
    """Directory index source that caches each directory's file listing.

    The listing produced by the parent ``_load_files_single_dir`` is stored
    per directory together with a signature taken from ``os.stat``. The
    directory is listed again only when that signature differs from the
    stored one (or when it could not be obtained).
    """

    def __init__(self, *args, **kwargs):
        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
        # the_dir -> (signature at last scan, list of results from the parent)
        self.cached_file_list = {}

    def _load_files_single_dir(self, the_dir):
        """Return the results for ``the_dir``, reusing the cached list if
        the directory's stat signature has not changed since the last scan.

        Returns a list (not a generator) of whatever the parent
        implementation yields for this directory.
        """
        try:
            st = os.stat(the_dir)
        except OSError:
            # Directory could not be stat'ed: never treat it as unchanged.
            signature = None
        else:
            # Deliberately excludes st_atime: merely listing the directory
            # may update it, which would invalidate the cache for no reason.
            # Attribute access yields float timestamps, so changes made
            # within the same second as the previous scan are still seen
            # where the filesystem records sub-second times.
            signature = (st.st_mode, st.st_ino, st.st_dev, st.st_nlink,
                         st.st_uid, st.st_gid, st.st_size,
                         st.st_mtime, st.st_ctime)

        cached = self.cached_file_list.get(the_dir)
        if cached is not None:
            last_signature, files = cached
            if signature is not None and last_signature == signature:
                print('Dir {0} unchanged'.format(the_dir))
                return files

        parent = super(CacheDirectoryIndexSource, self)
        # Materialize the parent's generator so the result can be replayed.
        files = list(parent._load_files_single_dir(the_dir))
        self.cached_file_list[the_dir] = (signature, files)
        return files
|
||||||
|
@ -254,7 +254,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
fp = upstream_res.raw._fp.fp
|
fp = upstream_res.raw._fp.fp
|
||||||
if hasattr(fp, 'raw'):
|
if hasattr(fp, 'raw'): #pragma: no cover
|
||||||
fp = fp.raw
|
fp = fp.raw
|
||||||
remote_ip = fp._sock.getpeername()[0]
|
remote_ip = fp._sock.getpeername()[0]
|
||||||
except: #pragma: no cover
|
except: #pragma: no cover
|
||||||
|
@ -7,7 +7,10 @@ from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
|
|||||||
|
|
||||||
from mock import patch
|
from mock import patch
|
||||||
|
|
||||||
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
|
import time
|
||||||
|
|
||||||
|
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
|
||||||
|
from webagg.aggregator import SimpleAggregator
|
||||||
from webagg.indexsource import MementoIndexSource
|
from webagg.indexsource import MementoIndexSource
|
||||||
|
|
||||||
|
|
||||||
@ -44,6 +47,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
fh.write('foo')
|
fh.write('foo')
|
||||||
|
|
||||||
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
||||||
|
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
|
||||||
|
|
||||||
def test_agg_no_coll_set(self):
|
def test_agg_no_coll_set(self):
|
||||||
res, errs = self.dir_loader(dict(url='example.com/'))
|
res, errs = self.dir_loader(dict(url='example.com/'))
|
||||||
@ -188,3 +192,25 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_cache_dir_sources_1(self):
    """The cached loader lists the same sources on repeated calls and
    picks up a file added to one of the directories afterwards."""
    params = {'url': 'example.com/', 'param.coll': '*'}

    exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                       'colls/B/indexes/iana.cdxj': 'file',
                       'colls/C/indexes/dupes.cdxj': 'file'}
          }

    # First call scans the directories.
    res = self.cache_dir_loader.get_source_list(dict(params))
    assert(res == exp)

    # Second call must return the same listing.
    res = self.cache_dir_loader.get_source_list(dict(params))
    assert(res == exp)

    new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')

    # Create the file (like `touch`). The explicit None keeps os.utime
    # callable on Python 2, where the times argument is mandatory.
    with open(new_file, 'a'):
        os.utime(new_file, None)

    res = self.cache_dir_loader.get_source_list(dict(params))

    # New File Included
    exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
    assert(res == exp)
|
||||||
|
@ -36,7 +36,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
|
|||||||
|
|
||||||
def test_live_paths(self):
|
def test_live_paths(self):
|
||||||
res = requests.get(self.base_url + '/')
|
res = requests.get(self.base_url + '/')
|
||||||
assert set(res.json().keys()) == {'/live/postreq', '/live'}
|
assert set(res.json().keys()) == {'/live/postreq', '/live', '/replay/postreq', '/replay'}
|
||||||
|
|
||||||
def test_upstream_paths(self):
|
def test_upstream_paths(self):
|
||||||
res = self.testapp.get('/')
|
res = self.testapp.get('/')
|
||||||
|
@ -7,7 +7,7 @@ from multiprocessing import Process
|
|||||||
|
|
||||||
from wsgiref.simple_server import make_server
|
from wsgiref.simple_server import make_server
|
||||||
|
|
||||||
from webagg.aggregator import SimpleAggregator
|
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
|
||||||
from webagg.app import ResAggApp
|
from webagg.app import ResAggApp
|
||||||
from webagg.handlers import DefaultResourceHandler
|
from webagg.handlers import DefaultResourceHandler
|
||||||
from webagg.indexsource import LiveIndexSource
|
from webagg.indexsource import LiveIndexSource
|
||||||
@ -66,6 +66,12 @@ class LiveServerTests(object):
|
|||||||
{'live': LiveIndexSource()})
|
{'live': LiveIndexSource()})
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
app.add_route('/replay',
|
||||||
|
DefaultResourceHandler(SimpleAggregator(
|
||||||
|
{'replay': CacheDirectoryIndexSource('./testdata/')}),
|
||||||
|
'./testdata/'
|
||||||
|
)
|
||||||
|
)
|
||||||
return app.application
|
return app.application
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
Loading…
x
Reference in New Issue
Block a user