diff --git a/config.yaml b/config.yaml index 420f0d26..01827eb2 100644 --- a/config.yaml +++ b/config.yaml @@ -3,6 +3,7 @@ # collections: + all: $all pywb: index_paths: ./sample_archive/cdx/ archive_paths: ./sample_archive/warcs/ diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 7f96194c..e42b7940 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections Accessing ``/all/`` will cause an aggregate lookup within the collections directory. -Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included. +Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included. + +Collection Provenance +""""""""""""""""""""" + +When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata +if Memento API is enabled. 
The ``rel="memento"`` link in the header will include an extra ``collection`` attribute, specifying the collection:: + + Link: ; rel="original", ; rel="timegate", ; rel="timemap"; type="application/link-format", ; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1" + + +For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for +``/all/timemap/link/http://example.com/`` might look as follows:: + + ; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT", + ; rel="timegate", + ; rel="original", + ; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1", + ; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2", + +Identifying the Collections +"""""""""""""""""""""""""""" + +When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata, +which, in addition to the memento relations, includes an extra ``collection`` attribute on the ``rel="memento"`` link, specifying the collection:: + + Link: ; rel="original", ; rel="timegate", ; rel="timemap"; type="application/link-format", ; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1" + + +For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for +``/all/timemap/link/http://example.com/`` might look as follows:: + + ; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT", + ; rel="timegate", + ; rel="original", + ; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1", + ; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2", Generic Collection Definitions diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 8bf64e1f..74966000 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -311,7 +311,7 @@ class RewriterApp(object): if not is_ajax and self.enable_memento: 
self._add_memento_links(cdx['url'], full_prefix, memento_dt, cdx['timestamp'], status_headers, - is_timegate, is_proxy) + is_timegate, is_proxy, cdx.get('source-coll')) set_content_loc = True @@ -344,7 +344,7 @@ class RewriterApp(object): return response def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, - status_headers, is_timegate, is_proxy): + status_headers, is_timegate, is_proxy, coll=None): # memento url + header if not memento_dt and memento_ts: @@ -370,7 +370,7 @@ class RewriterApp(object): link.append(MementoUtils.make_link(timemap_url, 'timemap')) if memento_dt: - link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) + link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll)) link_str = ', '.join(link) diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py index e948e669..b5e9745e 100644 --- a/pywb/utils/memento.py +++ b/pywb/utils/memento.py @@ -70,12 +70,10 @@ class MementoUtils(object): if not url: url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) - memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end - if not datetime: datetime = timestamp_to_http_date(cdx['timestamp']) - return memento.format(url, rel, datetime, cdx.get('source', '')) + return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end @classmethod def make_timemap(cls, cdx_iter): @@ -113,7 +111,11 @@ class MementoUtils(object): return '<{0}>; rel="{1}"'.format(url, type) @classmethod - def make_memento_link(cls, url, type, dt): - return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) + def make_memento_link(cls, url, type, dt, coll=None): + res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) + if coll: + res += '; collection="{0}"'.format(coll) + + return res diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 36d3e3b3..7b63299c 100644 --- a/pywb/warcserver/index/aggregator.py +++ 
b/pywb/warcserver/index/aggregator.py @@ -55,7 +55,7 @@ class BaseAggregator(object): cdx_iter = iter([]) err_list = [(name, repr(wbe))] - def add_name(cdx, name): + def add_source(cdx, name): if not cdx.get('url'): return cdx @@ -63,6 +63,9 @@ class BaseAggregator(object): cdx['source'] = name + ':' + cdx['source'] else: cdx['source'] = name + + cdx['source-coll'] = self._get_coll(name) + return cdx if params.get('nosource') != 'true': @@ -70,10 +73,13 @@ class BaseAggregator(object): if src_coll: name += ':' + src_coll - cdx_iter = (add_name(cdx, name) for cdx in cdx_iter) + cdx_iter = (add_source(cdx, name) for cdx in cdx_iter) return cdx_iter, err_list + def _get_coll(self, name): + return name + def load_index(self, params): res_list = self._load_all(params) @@ -295,6 +301,9 @@ class BaseDirectoryIndexSource(BaseAggregator): yield full_name, index_src + def _get_coll(self, name): + return name.split(os.path.sep, 1)[0] + def __repr__(self): return '{0}(file://{1})'.format(self.__class__.__name__, os.path.join(self.base_prefix, self.base_dir)) diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 85319172..08f552e2 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -41,6 +41,7 @@ class TestFuzzy(object): 'is_fuzzy': True, 'urlkey': canonicalize(url), 'source': 'source', + 'source-coll': 'source', 'url': url, 'mime': mime}] diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py index 01763ebc..608f091b 100644 --- a/pywb/warcserver/resource/pathresolvers.py +++ b/pywb/warcserver/resource/pathresolvers.py @@ -41,9 +41,10 @@ class PrefixResolver(object): if '*' not in path: return path - res_path = self.resolve_coll(path, cdx.get('source')) - if res_path: - return res_path + #res_path = self.resolve_coll(path, cdx.get('source')) + coll = cdx.get('source-coll') + if coll: + return 
path.replace('*', coll) if '://' in path: return path diff --git a/pywb/warcserver/resource/test/test_pathresolvers.py b/pywb/warcserver/resource/test/test_pathresolvers.py index 02f5214b..9b130402 100644 --- a/pywb/warcserver/resource/test/test_pathresolvers.py +++ b/pywb/warcserver/resource/test/test_pathresolvers.py @@ -35,6 +35,7 @@ class TestPathIndex(object): cdx = CDXObject() cdx['source'] = 'my-coll/indexes/index.cdxj' + cdx['source-coll'] = 'my-coll' res = resolver('example.warc.gz', cdx) assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz' diff --git a/pywb/warcserver/test/test_handlers.py b/pywb/warcserver/test/test_handlers.py index 79e2075c..1fb65b7d 100644 --- a/pywb/warcserver/test/test_handlers.py +++ b/pywb/warcserver/test/test_handlers.py @@ -142,7 +142,8 @@ class TestBaseWarcServer(MementoOverrideTests, FakeRedisTests, BaseTestClass): cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]) cdxlist[0]['timestamp'] = '2016' assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true', - 'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]) + 'mime': '', 'load_url': 'http://httpbin.org/get', + 'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}]) def test_live_resource(self): headers = {'foo': 'bar'} diff --git a/tests/memento_fixture.py b/tests/memento_fixture.py index 28ed7fc9..17d9fdcb 100644 --- a/tests/memento_fixture.py +++ b/tests/memento_fixture.py @@ -23,8 +23,10 @@ class MementoMixin(object): format_ = '; rel="timegate"' return format_.format(url, fmod_slash, coll) - def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'): + def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True): format_ = '; rel="memento"; datetime="{2}"' + if include_coll: + format_ += '; collection="{4}"' return format_.format(url, ts, dt, fmod, coll) diff --git a/tests/test_cdx_server_app.py 
b/tests/test_cdx_server_app.py index 6154bf90..c01dee91 100644 --- a/tests/test_cdx_server_app.py +++ b/tests/test_cdx_server_app.py @@ -181,7 +181,7 @@ class TestCDXApp(BaseTestClass): originals = {} for cdx in cdxes: cdx = CDXObject(cdx.encode('utf-8')) - assert len(cdx) == 15 + assert len(cdx) == 16 # orig.* fields are either all '-' or (int, int, filename) # check if orig.* fields are equals to corresponding fields diff --git a/tests/test_memento.py b/tests/test_memento.py index 69276d05..4f876bbc 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -54,7 +54,7 @@ class TestMemento(MementoMixin, BaseConfigTest): links = self.get_links(resp) - assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links + assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links #timegate link assert self.make_timegate_link(url, 'mp_') in links @@ -131,8 +131,8 @@ class TestMemento(MementoMixin, BaseConfigTest): ; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", ; rel="timegate", ; rel="original", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx" +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" """ assert exp == resp.text @@ -148,8 +148,8 @@ class TestMemento(MementoMixin, BaseConfigTest): resp.charset = 'utf-8' exp = """\ -com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"} -com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": 
"B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"} +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"} """ assert exp == resp.text diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 9cc68e67..3e603d50 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -60,7 +60,7 @@ class TestProxy(BaseTestProxy): assert 'WB Insert' in res.text assert 'Example Domain' in res.text - assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"' + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"' assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index c2bd6815..6c085383 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -97,6 +97,10 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj') assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj') + assert cdxj_lines[0]['source-coll'] == 'test' + assert cdxj_lines[1]['source-coll'] == 'test2' + assert cdxj_lines[2]['source-coll'] == 'test' + assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename'] def 
test_timemap_all_coll(self): @@ -104,8 +108,8 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): link_lines = res.text.rstrip().split('\n') assert len(link_lines) == 5 - assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3] - assert to_path('test/indexes/autoindex.cdxj') in link_lines[4] + assert to_path('collection="test2"') in link_lines[3] + assert to_path('collection="test"') in link_lines[4] # ============================================================================