1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

include the collection in Memento Link outputs: (#259)

* include the collection in Memento Link outputs:
- add new cdx 'source-coll' field, storing only the collection
- ensure rel="collection" property included in the TimeMap and Link header
- tests: update all tests to include the 'source-coll' property
- docs: add 'collection provenance' to auto-all collection configuration docs
This commit is contained in:
Ilya Kreymer 2017-10-23 15:33:23 -07:00 committed by GitHub
parent 9d681d1a8a
commit 459cd706d3
14 changed files with 83 additions and 25 deletions

View File

@ -3,6 +3,7 @@
# #
collections: collections:
all: $all
pywb: pywb:
index_paths: ./sample_archive/cdx/ index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/ archive_paths: ./sample_archive/warcs/

View File

@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections
Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory. Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included. Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.
Collection Provenance
"""""""""""""""""""""
When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
if the Memento API is enabled. The ``rel="memento"`` link in the header will include an extra ``collection`` attribute, specifying the collection::
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
``/all/timemap/link/http://example.com/`` might look as follows::
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
<http://example.com/>; rel="original",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
Identifying the Collections
""""""""""""""""""""""""""""
When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata,
which, in addition to the memento relations, includes an extra ``collection`` attribute on the ``rel="memento"`` link, specifying the collection::
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
``/all/timemap/link/http://example.com/`` might look as follows::
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
<http://example.com/>; rel="original",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
Generic Collection Definitions Generic Collection Definitions

View File

@ -311,7 +311,7 @@ class RewriterApp(object):
if not is_ajax and self.enable_memento: if not is_ajax and self.enable_memento:
self._add_memento_links(cdx['url'], full_prefix, self._add_memento_links(cdx['url'], full_prefix,
memento_dt, cdx['timestamp'], status_headers, memento_dt, cdx['timestamp'], status_headers,
is_timegate, is_proxy) is_timegate, is_proxy, cdx.get('source-coll'))
set_content_loc = True set_content_loc = True
@ -344,7 +344,7 @@ class RewriterApp(object):
return response return response
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy): status_headers, is_timegate, is_proxy, coll=None):
# memento url + header # memento url + header
if not memento_dt and memento_ts: if not memento_dt and memento_ts:
@ -370,7 +370,7 @@ class RewriterApp(object):
link.append(MementoUtils.make_link(timemap_url, 'timemap')) link.append(MementoUtils.make_link(timemap_url, 'timemap'))
if memento_dt: if memento_dt:
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))
link_str = ', '.join(link) link_str = ', '.join(link)

View File

@ -70,12 +70,10 @@ class MementoUtils(object):
if not url: if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
if not datetime: if not datetime:
datetime = timestamp_to_http_date(cdx['timestamp']) datetime = timestamp_to_http_date(cdx['timestamp'])
return memento.format(url, rel, datetime, cdx.get('source', '')) return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end
@classmethod @classmethod
def make_timemap(cls, cdx_iter): def make_timemap(cls, cdx_iter):
@ -113,7 +111,11 @@ class MementoUtils(object):
return '<{0}>; rel="{1}"'.format(url, type) return '<{0}>; rel="{1}"'.format(url, type)
@classmethod @classmethod
def make_memento_link(cls, url, type, dt): def make_memento_link(cls, url, type, dt, coll=None):
return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
if coll:
res += '; collection="{0}"'.format(coll)
return res

View File

@ -55,7 +55,7 @@ class BaseAggregator(object):
cdx_iter = iter([]) cdx_iter = iter([])
err_list = [(name, repr(wbe))] err_list = [(name, repr(wbe))]
def add_name(cdx, name): def add_source(cdx, name):
if not cdx.get('url'): if not cdx.get('url'):
return cdx return cdx
@ -63,6 +63,9 @@ class BaseAggregator(object):
cdx['source'] = name + ':' + cdx['source'] cdx['source'] = name + ':' + cdx['source']
else: else:
cdx['source'] = name cdx['source'] = name
cdx['source-coll'] = self._get_coll(name)
return cdx return cdx
if params.get('nosource') != 'true': if params.get('nosource') != 'true':
@ -70,10 +73,13 @@ class BaseAggregator(object):
if src_coll: if src_coll:
name += ':' + src_coll name += ':' + src_coll
cdx_iter = (add_name(cdx, name) for cdx in cdx_iter) cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)
return cdx_iter, err_list return cdx_iter, err_list
def _get_coll(self, name):
return name
def load_index(self, params): def load_index(self, params):
res_list = self._load_all(params) res_list = self._load_all(params)
@ -295,6 +301,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
yield full_name, index_src yield full_name, index_src
def _get_coll(self, name):
return name.split(os.path.sep, 1)[0]
def __repr__(self): def __repr__(self):
return '{0}(file://{1})'.format(self.__class__.__name__, return '{0}(file://{1})'.format(self.__class__.__name__,
os.path.join(self.base_prefix, self.base_dir)) os.path.join(self.base_prefix, self.base_dir))

View File

@ -41,6 +41,7 @@ class TestFuzzy(object):
'is_fuzzy': True, 'is_fuzzy': True,
'urlkey': canonicalize(url), 'urlkey': canonicalize(url),
'source': 'source', 'source': 'source',
'source-coll': 'source',
'url': url, 'url': url,
'mime': mime}] 'mime': mime}]

View File

@ -41,9 +41,10 @@ class PrefixResolver(object):
if '*' not in path: if '*' not in path:
return path return path
res_path = self.resolve_coll(path, cdx.get('source')) #res_path = self.resolve_coll(path, cdx.get('source'))
if res_path: coll = cdx.get('source-coll')
return res_path if coll:
return path.replace('*', coll)
if '://' in path: if '://' in path:
return path return path

View File

@ -35,6 +35,7 @@ class TestPathIndex(object):
cdx = CDXObject() cdx = CDXObject()
cdx['source'] = 'my-coll/indexes/index.cdxj' cdx['source'] = 'my-coll/indexes/index.cdxj'
cdx['source-coll'] = 'my-coll'
res = resolver('example.warc.gz', cdx) res = resolver('example.warc.gz', cdx)
assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz' assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'

View File

@ -142,7 +142,8 @@ class TestBaseWarcServer(MementoOverrideTests, FakeRedisTests, BaseTestClass):
cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]) cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
cdxlist[0]['timestamp'] = '2016' cdxlist[0]['timestamp'] = '2016'
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true', assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]) 'mime': '', 'load_url': 'http://httpbin.org/get',
'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])
def test_live_resource(self): def test_live_resource(self):
headers = {'foo': 'bar'} headers = {'foo': 'bar'}

View File

@ -23,8 +23,10 @@ class MementoMixin(object):
format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"' format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
return format_.format(url, fmod_slash, coll) return format_.format(url, fmod_slash, coll)
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'): def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"' format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
if include_coll:
format_ += '; collection="{4}"'
return format_.format(url, ts, dt, fmod, coll) return format_.format(url, ts, dt, fmod, coll)

View File

@ -181,7 +181,7 @@ class TestCDXApp(BaseTestClass):
originals = {} originals = {}
for cdx in cdxes: for cdx in cdxes:
cdx = CDXObject(cdx.encode('utf-8')) cdx = CDXObject(cdx.encode('utf-8'))
assert len(cdx) == 15 assert len(cdx) == 16
# orig.* fields are either all '-' or (int, int, filename) # orig.* fields are either all '-' or (int, int, filename)
# check if orig.* fields are equals to corresponding fields # check if orig.* fields are equals to corresponding fields

View File

@ -54,7 +54,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
links = self.get_links(resp) links = self.get_links(resp)
assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
#timegate link #timegate link
assert self.make_timegate_link(url, 'mp_') in links assert self.make_timegate_link(url, 'mp_') in links
@ -131,8 +131,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate", <http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original", <http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx", <http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx" <http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
""" """
assert exp == resp.text assert exp == resp.text
@ -148,8 +148,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
resp.charset = 'utf-8' resp.charset = 'utf-8'
exp = """\ exp = """\
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"} com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"} com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
""" """
assert exp == resp.text assert exp == resp.text

View File

@ -60,7 +60,7 @@ class TestProxy(BaseTestProxy):
assert 'WB Insert' in res.text assert 'WB Insert' in res.text
assert 'Example Domain' in res.text assert 'Example Domain' in res.text
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"' assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

View File

@ -97,6 +97,10 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj') assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj') assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
assert cdxj_lines[0]['source-coll'] == 'test'
assert cdxj_lines[1]['source-coll'] == 'test2'
assert cdxj_lines[2]['source-coll'] == 'test'
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename'] assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
def test_timemap_all_coll(self): def test_timemap_all_coll(self):
@ -104,8 +108,8 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
link_lines = res.text.rstrip().split('\n') link_lines = res.text.rstrip().split('\n')
assert len(link_lines) == 5 assert len(link_lines) == 5
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3] assert to_path('collection="test2"') in link_lines[3]
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4] assert to_path('collection="test"') in link_lines[4]
# ============================================================================ # ============================================================================