1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

dyn collection and all coll improvements: (#69)

Support dynamic collections and the 'all' collection with remote archives (e.g. s3:// paths):
- warcserver: allow custom dynamic collections index and archive path templates via 'dyn_index_path' and 'dyn_archive_path'
- pathresolver: allow resolving wildcard path prefixes with collection, to support remote paths and avoid globbing
- warcserver: don't add fixed collections dir to source to support resolving wildcard
- pathresolver: add wildcard resolving s3 path test
- referrer unrewrite: ensure referrer not empty
This commit is contained in:
Ilya Kreymer 2017-09-29 04:20:51 +00:00
parent 02f8fa9ff3
commit 924b983a8f
5 changed files with 36 additions and 14 deletions

View File

@ -556,8 +556,9 @@ class RewriterApp(object):
if referrer.startswith(full_prefix):
referrer = referrer[len(full_prefix):]
environ['HTTP_REFERER'] = WbUrl(referrer).url
return True
if referrer:
environ['HTTP_REFERER'] = WbUrl(referrer).url
return True
return False

View File

@ -33,6 +33,7 @@ class PrefixResolver(object):
def __call__(self, filename, cdx):
full_path = self.template
if hasattr(cdx, '_formatter') and cdx._formatter:
full_path = cdx._formatter.format(full_path)
@ -40,6 +41,10 @@ class PrefixResolver(object):
if '*' not in path:
return path
res_path = self.resolve_coll(path, cdx.get('source'))
if res_path:
return res_path
if '://' in path:
return path
@ -49,6 +54,13 @@ class PrefixResolver(object):
else:
return path
def resolve_coll(self, path, source):
    """Fill every '*' in ``path`` with the collection name taken from ``source``.

    The collection name is the part of ``source`` that precedes its first
    '/'. Returns None when ``source`` is empty or missing.
    """
    if source:
        coll_name, _, _ = source.partition('/')
        return path.replace('*', coll_name)

    return None
def __repr__(self):
return "PrefixResolver('{0}')".format(self.template)

View File

@ -30,6 +30,15 @@ class TestPathIndex(object):
assert len(res) == 1
assert res[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
def test_resolver_dir_wildcard_with_coll(self):
    """The '*' in an s3:// prefix is replaced by the collection from cdx['source']."""
    prefix = 's3://bucket/colls/*/archives/'
    resolver = DefaultResolverMixin.make_best_resolver(prefix)

    cdx = CDXObject()
    cdx['source'] = 'my-coll/indexes/index.cdxj'

    expected = 's3://bucket/colls/my-coll/archives/example.warc.gz'
    assert resolver('example.warc.gz', cdx) == expected
def test_resolver_dir_wildcard_as_file_url(self):
url = to_file_url(get_test_dir()) + '/*/'
resolver = DefaultResolverMixin.make_best_resolver(url)

View File

@ -84,14 +84,14 @@ class WarcServer(BaseWarcServer):
print('No Root Dir, Skip Auto Colls!')
return
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ,
name=self.root_dir)
self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep)
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ)
self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep)
if '://' not in self.archive_templ:
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
handler = DefaultResourceHandler(dir_source, self.archive_templ)

View File

@ -93,9 +93,9 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[0]['source'] == to_path('test/indexes/autoindex.cdxj')
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
@ -104,7 +104,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
link_lines = res.text.rstrip().split('\n')
assert len(link_lines) == 5
assert to_path('_test_colls:test2/indexes/autoindex.cdxj') in link_lines[3]
assert to_path('_test_colls:test/indexes/autoindex.cdxj') in link_lines[4]
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]