From 924b983a8f15dc60931f50cad338bbe4934baef6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 29 Sep 2017 04:20:51 +0000 Subject: [PATCH] dyn collection and all coll improvements: (#69) support dynamic collections, all collection with remote archives (eg. s3:// paths) - warcserver: allow custom dynamic collections index and archive path templates via 'dyn_index_path' and 'dyn_archive_path' - pathresolver: allow resolving wildcard path prefixes with collection, to support remote paths and avoid globbing - warcserver: don't add fixed collections dir to source to support resolving wildcard - pathresolver: add wildcard resolving s3 path test - referrer unrewrite: ensure referrer not empty --- pywb/apps/rewriterapp.py | 5 +++-- pywb/warcserver/resource/pathresolvers.py | 12 ++++++++++++ .../warcserver/resource/test/test_pathresolvers.py | 9 +++++++++ pywb/warcserver/warcserver.py | 14 +++++++------- tests/test_record_replay.py | 10 +++++----- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index a55098c9..13fb9875 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -556,8 +556,9 @@ class RewriterApp(object): if referrer.startswith(full_prefix): referrer = referrer[len(full_prefix):] - environ['HTTP_REFERER'] = WbUrl(referrer).url - return True + if referrer: + environ['HTTP_REFERER'] = WbUrl(referrer).url + return True return False diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py index f6f6ad61..eed85857 100644 --- a/pywb/warcserver/resource/pathresolvers.py +++ b/pywb/warcserver/resource/pathresolvers.py @@ -33,6 +33,7 @@ class PrefixResolver(object): def __call__(self, filename, cdx): full_path = self.template + if hasattr(cdx, '_formatter') and cdx._formatter: full_path = cdx._formatter.format(full_path) @@ -40,6 +41,10 @@ class PrefixResolver(object): if '*' not in path: return path + res_path = self.resolve_coll(path, cdx.get('source')) + if res_path: + return res_path + if '://' in path: return path @@ -49,6 +54,13 @@ class PrefixResolver(object): else: return path + def resolve_coll(self, path, source): + if not source: + return + + coll = source.split('/', 1)[0] + return path.replace('*', coll) + def __repr__(self): return "PrefixResolver('{0}')".format(self.template) diff --git a/pywb/warcserver/resource/test/test_pathresolvers.py b/pywb/warcserver/resource/test/test_pathresolvers.py index 9fc14128..02f5214b 100644 --- a/pywb/warcserver/resource/test/test_pathresolvers.py +++ b/pywb/warcserver/resource/test/test_pathresolvers.py @@ -30,6 +30,15 @@ class TestPathIndex(object): assert len(res) == 1 assert res[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + def test_resolver_dir_wildcard_with_coll(self): + resolver = DefaultResolverMixin.make_best_resolver('s3://bucket/colls/*/archives/') + + cdx = CDXObject() + cdx['source'] = 'my-coll/indexes/index.cdxj' + + res = resolver('example.warc.gz', cdx) + assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz' + def test_resolver_dir_wildcard_as_file_url(self): url = to_file_url(get_test_dir()) + '/*/' resolver = DefaultResolverMixin.make_best_resolver(url) diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 96fb63e7..f416690a 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -84,14 +84,14 @@ class WarcServer(BaseWarcServer): print('No Root Dir, Skip Auto Colls!') return - #indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep - self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep) - dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, - base_dir=self.indexes_templ, - name=self.root_dir) + self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep) - self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep) - self.archive_templ = os.path.join(self.root_dir, self.archive_templ) + dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, + base_dir=self.indexes_templ) + + self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep) + if '://' not in self.archive_templ: + self.archive_templ = os.path.join(self.root_dir, self.archive_templ) handler = DefaultResourceHandler(dir_source, self.archive_templ) diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index 2b39db89..6ccd2a1e 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -93,9 +93,9 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D' assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D' - assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj') - assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj') - assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj') + assert cdxj_lines[0]['source'] == to_path('test/indexes/autoindex.cdxj') + assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj') + assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj') assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename'] @@ -104,7 +104,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): link_lines = res.text.rstrip().split('\n') assert len(link_lines) == 5 - assert to_path('_test_colls:test2/indexes/autoindex.cdxj') in link_lines[3] - assert to_path('_test_colls:test/indexes/autoindex.cdxj') in link_lines[4] + assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3] + assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]