mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
dyn collection and all coll improvements: (#69)
Support dynamic collections and the "all" collection with remote archives (e.g. s3:// paths):
- warcserver: allow custom index and archive path templates for dynamic collections via 'dyn_index_path' and 'dyn_archive_path'
- pathresolver: allow resolving wildcard path prefixes with the collection name, to support remote paths and avoid globbing
- warcserver: don't add the fixed collections dir to the source, so that wildcards can be resolved
- pathresolver: add a test for resolving a wildcard s3 path
- referrer unrewrite: ensure the referrer is not empty
This commit is contained in:
parent
02f8fa9ff3
commit
924b983a8f
@ -556,8 +556,9 @@ class RewriterApp(object):
|
||||
|
||||
if referrer.startswith(full_prefix):
|
||||
referrer = referrer[len(full_prefix):]
|
||||
environ['HTTP_REFERER'] = WbUrl(referrer).url
|
||||
return True
|
||||
if referrer:
|
||||
environ['HTTP_REFERER'] = WbUrl(referrer).url
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
@ -33,6 +33,7 @@ class PrefixResolver(object):
|
||||
|
||||
def __call__(self, filename, cdx):
|
||||
full_path = self.template
|
||||
|
||||
if hasattr(cdx, '_formatter') and cdx._formatter:
|
||||
full_path = cdx._formatter.format(full_path)
|
||||
|
||||
@ -40,6 +41,10 @@ class PrefixResolver(object):
|
||||
if '*' not in path:
|
||||
return path
|
||||
|
||||
res_path = self.resolve_coll(path, cdx.get('source'))
|
||||
if res_path:
|
||||
return res_path
|
||||
|
||||
if '://' in path:
|
||||
return path
|
||||
|
||||
@ -49,6 +54,13 @@ class PrefixResolver(object):
|
||||
else:
|
||||
return path
|
||||
|
||||
def resolve_coll(self, path, source):
    """Fill the wildcard in ``path`` with the collection name from ``source``.

    The collection name is the first path component of ``source``
    (e.g. ``'my-coll'`` for ``'my-coll/indexes/index.cdxj'``).

    :param path: prefix template containing one or more ``*`` placeholders
    :param source: the ``source`` value of a cdx entry, or a falsy value
    :return: ``path`` with every ``*`` replaced by the collection name,
        or ``None`` when ``source`` is empty
    """
    import os

    if not source:
        return None

    # Source values may be built with the platform separator (e.g. a
    # backslash on Windows), so accept both '/' and os.sep as the
    # boundary of the leading collection component.
    if os.sep != '/':
        source = source.replace(os.sep, '/')

    coll = source.split('/', 1)[0]
    return path.replace('*', coll)
|
||||
|
||||
def __repr__(self):
    """Return a debug representation that shows this resolver's template."""
    repr_format = "PrefixResolver('{0}')"
    return repr_format.format(self.template)
|
||||
|
||||
|
@ -30,6 +30,15 @@ class TestPathIndex(object):
|
||||
assert len(res) == 1
|
||||
assert res[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
|
||||
|
||||
def test_resolver_dir_wildcard_with_coll(self):
    """A wildcard in a remote (s3://) prefix is filled from the cdx source.

    The cdx entry's ``source`` starts with ``my-coll``; the resolved path
    is expected to have ``*`` replaced by that name, followed by the
    requested filename.
    """
    entry = CDXObject()
    entry['source'] = 'my-coll/indexes/index.cdxj'

    resolve = DefaultResolverMixin.make_best_resolver('s3://bucket/colls/*/archives/')

    assert resolve('example.warc.gz', entry) == 's3://bucket/colls/my-coll/archives/example.warc.gz'
|
||||
|
||||
def test_resolver_dir_wildcard_as_file_url(self):
|
||||
url = to_file_url(get_test_dir()) + '/*/'
|
||||
resolver = DefaultResolverMixin.make_best_resolver(url)
|
||||
|
@ -84,14 +84,14 @@ class WarcServer(BaseWarcServer):
|
||||
print('No Root Dir, Skip Auto Colls!')
|
||||
return
|
||||
|
||||
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
||||
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
|
||||
base_dir=self.indexes_templ,
|
||||
name=self.root_dir)
|
||||
self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep)
|
||||
|
||||
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
|
||||
base_dir=self.indexes_templ)
|
||||
|
||||
self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep)
|
||||
if '://' not in self.archive_templ:
|
||||
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||
|
||||
handler = DefaultResourceHandler(dir_source, self.archive_templ)
|
||||
|
||||
|
@ -93,9 +93,9 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
|
||||
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
|
||||
|
||||
assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[0]['source'] == to_path('test/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
|
||||
|
||||
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
||||
|
||||
@ -104,7 +104,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
link_lines = res.text.rstrip().split('\n')
|
||||
assert len(link_lines) == 5
|
||||
|
||||
assert to_path('_test_colls:test2/indexes/autoindex.cdxj') in link_lines[3]
|
||||
assert to_path('_test_colls:test/indexes/autoindex.cdxj') in link_lines[4]
|
||||
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
|
||||
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user