include the collection in Memento Link outputs: (#259)

* include the collection in Memento Link outputs: - add new cdx 'source-coll' field, storing only the collection - ensure rel="collection" property included in the TimeMap and Link header - tests: update all tests to include the 'source-coll' property - docs: add 'collection provenance' to auto-all collection configuration docs
2025-03-15 00:03:28 +01:00 · 2017-10-23 15:33:23 -07:00 · 2017-10-23 15:33:23 -07:00 · 459cd706d3
commit 459cd706d3
parent 9d681d1a8a
14 changed files with 83 additions and 25 deletions
--- a/config.yaml
+++ b/config.yaml
@ -3,6 +3,7 @@
 #
 collections:
    all: $all
    pywb:
        index_paths: ./sample_archive/cdx/
        archive_paths: ./sample_archive/warcs/
--- a/docs/manual/configuring.rst
+++ b/docs/manual/configuring.rst
@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections
 Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
-Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included.
+Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.
 Collection Provenance
 """""""""""""""""""""
 When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
 if the Memento API is enabled. The ``rel="memento"`` link in the header will include an extra ``collection="<name>"`` attribute, specifying the collection::
  Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
 For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
 ``/all/timemap/link/http://example.com/`` might look as follows::
  <http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
  <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
  <http://example.com/>; rel="original",
  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
 Identifying the Collections
 """"""""""""""""""""""""""""
 When using the "all" collection, it is possible to determine the actual collection of each URL by looking at the ``Link`` header metadata,
 which, in addition to the memento relations, includes an extra ``collection="<name>"`` attribute on each ``rel="memento"`` link, specifying the collection::
  Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
 For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
 ``/all/timemap/link/http://example.com/`` might look as follows::
  <http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
  <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
  <http://example.com/>; rel="original",
  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
 Generic Collection Definitions
--- a/pywb/apps/rewriterapp.py
+++ b/pywb/apps/rewriterapp.py
@ -311,7 +311,7 @@ class RewriterApp(object):
        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
-                                    is_timegate, is_proxy)
+                                    is_timegate, is_proxy, cdx.get('source-coll'))
            set_content_loc = True
@ -344,7 +344,7 @@ class RewriterApp(object):
        return response
    def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
-                           status_headers, is_timegate, is_proxy):
+                           status_headers, is_timegate, is_proxy, coll=None):
        # memento url + header
        if not memento_dt and memento_ts:
@ -370,7 +370,7 @@ class RewriterApp(object):
            link.append(MementoUtils.make_link(timemap_url, 'timemap'))
        if memento_dt:
-            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))
+            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))
        link_str = ', '.join(link)
--- a/pywb/utils/memento.py
+++ b/pywb/utils/memento.py
@ -70,12 +70,10 @@ class MementoUtils(object):
        if not url:
            url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
        memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
        if not datetime:
            datetime = timestamp_to_http_date(cdx['timestamp'])
-        return memento.format(url, rel, datetime, cdx.get('source', ''))
+        return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end
    @classmethod
    def make_timemap(cls, cdx_iter):
@ -113,7 +111,11 @@ class MementoUtils(object):
        return '<{0}>; rel="{1}"'.format(url, type)
    @classmethod
-    def make_memento_link(cls, url, type, dt):
+    def make_memento_link(cls, url, type, dt, coll=None):
-        return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
+        res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
        if coll:
            res += '; collection="{0}"'.format(coll)
        return res
--- a/pywb/warcserver/index/aggregator.py
+++ b/pywb/warcserver/index/aggregator.py
@ -55,7 +55,7 @@ class BaseAggregator(object):
            cdx_iter = iter([])
            err_list = [(name, repr(wbe))]
-        def add_name(cdx, name):
+        def add_source(cdx, name):
            if not cdx.get('url'):
                return cdx
@ -63,6 +63,9 @@ class BaseAggregator(object):
                cdx['source'] = name + ':' + cdx['source']
            else:
                cdx['source'] = name
            cdx['source-coll'] = self._get_coll(name)
            return cdx
        if params.get('nosource') != 'true':
@ -70,10 +73,13 @@ class BaseAggregator(object):
            if src_coll:
                name += ':' + src_coll
-            cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
+            cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)
        return cdx_iter, err_list
    def _get_coll(self, name):
        return name
    def load_index(self, params):
        res_list = self._load_all(params)
@ -295,6 +301,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
                yield full_name, index_src
    def _get_coll(self, name):
        return name.split(os.path.sep, 1)[0]
    def __repr__(self):
        return '{0}(file://{1})'.format(self.__class__.__name__,
                                        os.path.join(self.base_prefix, self.base_dir))
--- a/pywb/warcserver/index/test/test_fuzzymatcher.py
+++ b/pywb/warcserver/index/test/test_fuzzymatcher.py
@ -41,6 +41,7 @@ class TestFuzzy(object):
               'is_fuzzy': True,
               'urlkey': canonicalize(url),
               'source': 'source',
               'source-coll': 'source',
               'url': url,
               'mime': mime}]
--- a/pywb/warcserver/resource/pathresolvers.py
+++ b/pywb/warcserver/resource/pathresolvers.py
@ -41,9 +41,10 @@ class PrefixResolver(object):
        if '*' not in path:
            return path
-        res_path = self.resolve_coll(path, cdx.get('source'))
+        #res_path = self.resolve_coll(path, cdx.get('source'))
-        if res_path:
+        coll = cdx.get('source-coll')
-            return res_path
+        if coll:
            return path.replace('*', coll)
        if '://' in path:
            return path
--- a/pywb/warcserver/resource/test/test_pathresolvers.py
+++ b/pywb/warcserver/resource/test/test_pathresolvers.py
@ -35,6 +35,7 @@ class TestPathIndex(object):
        cdx = CDXObject()
        cdx['source'] = 'my-coll/indexes/index.cdxj'
        cdx['source-coll'] = 'my-coll'
        res = resolver('example.warc.gz', cdx)
        assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'
--- a/pywb/warcserver/test/test_handlers.py
+++ b/pywb/warcserver/test/test_handlers.py
@ -142,7 +142,8 @@ class TestBaseWarcServer(MementoOverrideTests, FakeRedisTests, BaseTestClass):
        cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
        cdxlist[0]['timestamp'] = '2016'
        assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
-                            'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
+                            'mime': '', 'load_url': 'http://httpbin.org/get',
                            'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])
    def test_live_resource(self):
        headers = {'foo': 'bar'}
--- a/tests/memento_fixture.py
+++ b/tests/memento_fixture.py
@ -23,8 +23,10 @@ class MementoMixin(object):
        format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
        return format_.format(url, fmod_slash, coll)
-    def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'):
+    def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
        format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
        if include_coll:
            format_ += '; collection="{4}"'
        return format_.format(url, ts, dt, fmod, coll)
--- a/tests/test_cdx_server_app.py
+++ b/tests/test_cdx_server_app.py
@ -181,7 +181,7 @@ class TestCDXApp(BaseTestClass):
        originals = {}
        for cdx in cdxes:
            cdx = CDXObject(cdx.encode('utf-8'))
-            assert len(cdx) == 15
+            assert len(cdx) == 16
            # orig.* fields are either all '-' or (int, int, filename)
            # check if orig.* fields are equals to corresponding fields
--- a/tests/test_memento.py
+++ b/tests/test_memento.py
@ -54,7 +54,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
        links = self.get_links(resp)
-        assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links
+        assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
        #timegate link
        assert self.make_timegate_link(url, 'mp_') in links
@ -131,8 +131,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
 <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
 <http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
 <http://example.com?example=1>; rel="original",
-<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx",
+<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
-<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx"
+<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
 """
        assert exp == resp.text
@ -148,8 +148,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
        resp.charset = 'utf-8'
        exp = """\
-com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
-com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
 """
        assert exp == resp.text
--- a/tests/test_proxy.py
+++ b/tests/test_proxy.py
@ -60,7 +60,7 @@ class TestProxy(BaseTestProxy):
        assert 'WB Insert' in res.text
        assert 'Example Domain' in res.text
-        assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
+        assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
        assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
--- a/tests/test_record_replay.py
+++ b/tests/test_record_replay.py
@ -97,6 +97,10 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
        assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
        assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
        assert cdxj_lines[0]['source-coll'] == 'test'
        assert cdxj_lines[1]['source-coll'] == 'test2'
        assert cdxj_lines[2]['source-coll'] == 'test'
        assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
    def test_timemap_all_coll(self):
@ -104,8 +108,8 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
        link_lines = res.text.rstrip().split('\n')
        assert len(link_lines) == 5
-        assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
+        assert to_path('collection="test2"') in link_lines[3]
-        assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
+        assert to_path('collection="test"') in link_lines[4]
 # ============================================================================