mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
include the collection in Memento Link outputs: (#259)
* include the collection in Memento Link outputs: - add new cdx 'source-coll' field, storing only the collection - ensure rel="collection" property included in the TimeMap and Link header - tests: update all tests to include the 'source-coll' property - docs: add 'collection provenance' to auto-all collection configuration docs
This commit is contained in:
parent
9d681d1a8a
commit
459cd706d3
@ -3,6 +3,7 @@
|
||||
#
|
||||
|
||||
collections:
|
||||
all: $all
|
||||
pywb:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
archive_paths: ./sample_archive/warcs/
|
||||
|
@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections
|
||||
|
||||
Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
|
||||
|
||||
Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included.
|
||||
Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.
|
||||
|
||||
Collection Provenance
|
||||
"""""""""""""""""""""
|
||||
|
||||
When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
|
||||
if the Memento API is enabled. The ``rel="memento"`` link in the header will carry an extra ``collection`` attribute, specifying the collection::
|
||||
|
||||
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
|
||||
|
||||
|
||||
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
|
||||
``/all/timemap/link/http://example.com/`` might look as follows::
|
||||
|
||||
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
|
||||
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
|
||||
<http://example.com/>; rel="original",
|
||||
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
|
||||
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
|
||||
|
||||
Identifying the Collections
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata,
|
||||
which, in addition to the Memento relations, includes an extra ``collection`` attribute on the ``rel="memento"`` link, specifying the collection::
|
||||
|
||||
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
|
||||
|
||||
|
||||
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
|
||||
``/all/timemap/link/http://example.com/`` might look as follows::
|
||||
|
||||
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
|
||||
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
|
||||
<http://example.com/>; rel="original",
|
||||
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
|
||||
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
|
||||
|
||||
|
||||
Generic Collection Definitions
|
||||
|
@ -311,7 +311,7 @@ class RewriterApp(object):
|
||||
if not is_ajax and self.enable_memento:
|
||||
self._add_memento_links(cdx['url'], full_prefix,
|
||||
memento_dt, cdx['timestamp'], status_headers,
|
||||
is_timegate, is_proxy)
|
||||
is_timegate, is_proxy, cdx.get('source-coll'))
|
||||
|
||||
set_content_loc = True
|
||||
|
||||
@ -344,7 +344,7 @@ class RewriterApp(object):
|
||||
return response
|
||||
|
||||
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
||||
status_headers, is_timegate, is_proxy):
|
||||
status_headers, is_timegate, is_proxy, coll=None):
|
||||
|
||||
# memento url + header
|
||||
if not memento_dt and memento_ts:
|
||||
@ -370,7 +370,7 @@ class RewriterApp(object):
|
||||
link.append(MementoUtils.make_link(timemap_url, 'timemap'))
|
||||
|
||||
if memento_dt:
|
||||
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))
|
||||
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))
|
||||
|
||||
link_str = ', '.join(link)
|
||||
|
||||
|
@ -70,12 +70,10 @@ class MementoUtils(object):
|
||||
if not url:
|
||||
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
|
||||
|
||||
memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
|
||||
|
||||
if not datetime:
|
||||
datetime = timestamp_to_http_date(cdx['timestamp'])
|
||||
|
||||
return memento.format(url, rel, datetime, cdx.get('source', ''))
|
||||
return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end
|
||||
|
||||
@classmethod
|
||||
def make_timemap(cls, cdx_iter):
|
||||
@ -113,7 +111,11 @@ class MementoUtils(object):
|
||||
return '<{0}>; rel="{1}"'.format(url, type)
|
||||
|
||||
@classmethod
|
||||
def make_memento_link(cls, url, type, dt):
|
||||
return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
|
||||
def make_memento_link(cls, url, type, dt, coll=None):
|
||||
res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
|
||||
if coll:
|
||||
res += '; collection="{0}"'.format(coll)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
|
@ -55,7 +55,7 @@ class BaseAggregator(object):
|
||||
cdx_iter = iter([])
|
||||
err_list = [(name, repr(wbe))]
|
||||
|
||||
def add_name(cdx, name):
|
||||
def add_source(cdx, name):
|
||||
if not cdx.get('url'):
|
||||
return cdx
|
||||
|
||||
@ -63,6 +63,9 @@ class BaseAggregator(object):
|
||||
cdx['source'] = name + ':' + cdx['source']
|
||||
else:
|
||||
cdx['source'] = name
|
||||
|
||||
cdx['source-coll'] = self._get_coll(name)
|
||||
|
||||
return cdx
|
||||
|
||||
if params.get('nosource') != 'true':
|
||||
@ -70,10 +73,13 @@ class BaseAggregator(object):
|
||||
if src_coll:
|
||||
name += ':' + src_coll
|
||||
|
||||
cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
|
||||
cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)
|
||||
|
||||
return cdx_iter, err_list
|
||||
|
||||
def _get_coll(self, name):
|
||||
return name
|
||||
|
||||
def load_index(self, params):
|
||||
res_list = self._load_all(params)
|
||||
|
||||
@ -295,6 +301,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
||||
|
||||
yield full_name, index_src
|
||||
|
||||
def _get_coll(self, name):
|
||||
return name.split(os.path.sep, 1)[0]
|
||||
|
||||
def __repr__(self):
|
||||
return '{0}(file://{1})'.format(self.__class__.__name__,
|
||||
os.path.join(self.base_prefix, self.base_dir))
|
||||
|
@ -41,6 +41,7 @@ class TestFuzzy(object):
|
||||
'is_fuzzy': True,
|
||||
'urlkey': canonicalize(url),
|
||||
'source': 'source',
|
||||
'source-coll': 'source',
|
||||
'url': url,
|
||||
'mime': mime}]
|
||||
|
||||
|
@ -41,9 +41,10 @@ class PrefixResolver(object):
|
||||
if '*' not in path:
|
||||
return path
|
||||
|
||||
res_path = self.resolve_coll(path, cdx.get('source'))
|
||||
if res_path:
|
||||
return res_path
|
||||
#res_path = self.resolve_coll(path, cdx.get('source'))
|
||||
coll = cdx.get('source-coll')
|
||||
if coll:
|
||||
return path.replace('*', coll)
|
||||
|
||||
if '://' in path:
|
||||
return path
|
||||
|
@ -35,6 +35,7 @@ class TestPathIndex(object):
|
||||
|
||||
cdx = CDXObject()
|
||||
cdx['source'] = 'my-coll/indexes/index.cdxj'
|
||||
cdx['source-coll'] = 'my-coll'
|
||||
|
||||
res = resolver('example.warc.gz', cdx)
|
||||
assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'
|
||||
|
@ -142,7 +142,8 @@ class TestBaseWarcServer(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||
cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
|
||||
cdxlist[0]['timestamp'] = '2016'
|
||||
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
|
||||
'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
|
||||
'mime': '', 'load_url': 'http://httpbin.org/get',
|
||||
'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])
|
||||
|
||||
def test_live_resource(self):
|
||||
headers = {'foo': 'bar'}
|
||||
|
@ -23,8 +23,10 @@ class MementoMixin(object):
|
||||
format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
|
||||
return format_.format(url, fmod_slash, coll)
|
||||
|
||||
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'):
|
||||
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
|
||||
format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
|
||||
if include_coll:
|
||||
format_ += '; collection="{4}"'
|
||||
return format_.format(url, ts, dt, fmod, coll)
|
||||
|
||||
|
||||
|
@ -181,7 +181,7 @@ class TestCDXApp(BaseTestClass):
|
||||
originals = {}
|
||||
for cdx in cdxes:
|
||||
cdx = CDXObject(cdx.encode('utf-8'))
|
||||
assert len(cdx) == 15
|
||||
assert len(cdx) == 16
|
||||
|
||||
# orig.* fields are either all '-' or (int, int, filename)
|
||||
# check if orig.* fields are equal to the corresponding fields
|
||||
|
@ -54,7 +54,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
||||
|
||||
links = self.get_links(resp)
|
||||
|
||||
assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links
|
||||
assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
|
||||
|
||||
#timegate link
|
||||
assert self.make_timegate_link(url, 'mp_') in links
|
||||
@ -131,8 +131,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
||||
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
|
||||
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
|
||||
<http://example.com?example=1>; rel="original",
|
||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx",
|
||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx"
|
||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
|
||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
|
||||
"""
|
||||
assert exp == resp.text
|
||||
|
||||
@ -148,8 +148,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
||||
resp.charset = 'utf-8'
|
||||
|
||||
exp = """\
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
|
||||
"""
|
||||
assert exp == resp.text
|
||||
|
||||
|
@ -60,7 +60,7 @@ class TestProxy(BaseTestProxy):
|
||||
assert 'WB Insert' in res.text
|
||||
assert 'Example Domain' in res.text
|
||||
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
|
||||
|
||||
|
@ -97,6 +97,10 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
|
||||
|
||||
assert cdxj_lines[0]['source-coll'] == 'test'
|
||||
assert cdxj_lines[1]['source-coll'] == 'test2'
|
||||
assert cdxj_lines[2]['source-coll'] == 'test'
|
||||
|
||||
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
||||
|
||||
def test_timemap_all_coll(self):
|
||||
@ -104,8 +108,8 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
link_lines = res.text.rstrip().split('\n')
|
||||
assert len(link_lines) == 5
|
||||
|
||||
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
|
||||
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
|
||||
assert to_path('collection="test2"') in link_lines[3]
|
||||
assert to_path('collection="test"') in link_lines[4]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
Loading…
x
Reference in New Issue
Block a user