mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
include the collection in Memento Link outputs: (#259)
* include the collection in Memento Link outputs: - add new cdx 'source-coll' field, storing only the collection - ensure rel="collection" property included in the TimeMap and Link header - tests: update all tests to include the 'source-coll' property - docs: add 'collection provenance' to auto-all collection configuration docs
This commit is contained in:
parent
9d681d1a8a
commit
459cd706d3
@ -3,6 +3,7 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
|
all: $all
|
||||||
pywb:
|
pywb:
|
||||||
index_paths: ./sample_archive/cdx/
|
index_paths: ./sample_archive/cdx/
|
||||||
archive_paths: ./sample_archive/warcs/
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections
|
|||||||
|
|
||||||
Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
|
Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
|
||||||
|
|
||||||
Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included.
|
Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.
|
||||||
|
|
||||||
|
Collection Provenance
|
||||||
|
"""""""""""""""""""""
|
||||||
|
|
||||||
|
When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
|
||||||
|
if the Memento API is enabled. The ``rel="memento"`` link in the header will include an extra ``collection`` attribute, specifying the collection::
|
||||||
|
|
||||||
|
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
|
||||||
|
|
||||||
|
|
||||||
|
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
|
||||||
|
``/all/timemap/link/http://example.com/`` might look as follows::
|
||||||
|
|
||||||
|
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
|
||||||
|
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
|
||||||
|
<http://example.com/>; rel="original",
|
||||||
|
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
|
||||||
|
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
|
||||||
|
|
||||||
|
Identifying the Collections
|
||||||
|
""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata,
|
||||||
|
which, in addition to the memento relations, includes an extra ``collection`` attribute on the ``rel="memento"`` link, specifying the collection::
|
||||||
|
|
||||||
|
Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"
|
||||||
|
|
||||||
|
|
||||||
|
For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
|
||||||
|
``/all/timemap/link/http://example.com/`` might look as follows::
|
||||||
|
|
||||||
|
<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
|
||||||
|
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
|
||||||
|
<http://example.com/>; rel="original",
|
||||||
|
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
|
||||||
|
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",
|
||||||
|
|
||||||
|
|
||||||
Generic Collection Definitions
|
Generic Collection Definitions
|
||||||
|
@ -311,7 +311,7 @@ class RewriterApp(object):
|
|||||||
if not is_ajax and self.enable_memento:
|
if not is_ajax and self.enable_memento:
|
||||||
self._add_memento_links(cdx['url'], full_prefix,
|
self._add_memento_links(cdx['url'], full_prefix,
|
||||||
memento_dt, cdx['timestamp'], status_headers,
|
memento_dt, cdx['timestamp'], status_headers,
|
||||||
is_timegate, is_proxy)
|
is_timegate, is_proxy, cdx.get('source-coll'))
|
||||||
|
|
||||||
set_content_loc = True
|
set_content_loc = True
|
||||||
|
|
||||||
@ -344,7 +344,7 @@ class RewriterApp(object):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
||||||
status_headers, is_timegate, is_proxy):
|
status_headers, is_timegate, is_proxy, coll=None):
|
||||||
|
|
||||||
# memento url + header
|
# memento url + header
|
||||||
if not memento_dt and memento_ts:
|
if not memento_dt and memento_ts:
|
||||||
@ -370,7 +370,7 @@ class RewriterApp(object):
|
|||||||
link.append(MementoUtils.make_link(timemap_url, 'timemap'))
|
link.append(MementoUtils.make_link(timemap_url, 'timemap'))
|
||||||
|
|
||||||
if memento_dt:
|
if memento_dt:
|
||||||
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))
|
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))
|
||||||
|
|
||||||
link_str = ', '.join(link)
|
link_str = ', '.join(link)
|
||||||
|
|
||||||
|
@ -70,12 +70,10 @@ class MementoUtils(object):
|
|||||||
if not url:
|
if not url:
|
||||||
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
|
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
|
||||||
|
|
||||||
memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
|
|
||||||
|
|
||||||
if not datetime:
|
if not datetime:
|
||||||
datetime = timestamp_to_http_date(cdx['timestamp'])
|
datetime = timestamp_to_http_date(cdx['timestamp'])
|
||||||
|
|
||||||
return memento.format(url, rel, datetime, cdx.get('source', ''))
|
return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def make_timemap(cls, cdx_iter):
|
def make_timemap(cls, cdx_iter):
|
||||||
@ -113,7 +111,11 @@ class MementoUtils(object):
|
|||||||
return '<{0}>; rel="{1}"'.format(url, type)
|
return '<{0}>; rel="{1}"'.format(url, type)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def make_memento_link(cls, url, type, dt):
|
def make_memento_link(cls, url, type, dt, coll=None):
|
||||||
return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
|
res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
|
||||||
|
if coll:
|
||||||
|
res += '; collection="{0}"'.format(coll)
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ class BaseAggregator(object):
|
|||||||
cdx_iter = iter([])
|
cdx_iter = iter([])
|
||||||
err_list = [(name, repr(wbe))]
|
err_list = [(name, repr(wbe))]
|
||||||
|
|
||||||
def add_name(cdx, name):
|
def add_source(cdx, name):
|
||||||
if not cdx.get('url'):
|
if not cdx.get('url'):
|
||||||
return cdx
|
return cdx
|
||||||
|
|
||||||
@ -63,6 +63,9 @@ class BaseAggregator(object):
|
|||||||
cdx['source'] = name + ':' + cdx['source']
|
cdx['source'] = name + ':' + cdx['source']
|
||||||
else:
|
else:
|
||||||
cdx['source'] = name
|
cdx['source'] = name
|
||||||
|
|
||||||
|
cdx['source-coll'] = self._get_coll(name)
|
||||||
|
|
||||||
return cdx
|
return cdx
|
||||||
|
|
||||||
if params.get('nosource') != 'true':
|
if params.get('nosource') != 'true':
|
||||||
@ -70,10 +73,13 @@ class BaseAggregator(object):
|
|||||||
if src_coll:
|
if src_coll:
|
||||||
name += ':' + src_coll
|
name += ':' + src_coll
|
||||||
|
|
||||||
cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
|
cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)
|
||||||
|
|
||||||
return cdx_iter, err_list
|
return cdx_iter, err_list
|
||||||
|
|
||||||
|
def _get_coll(self, name):
|
||||||
|
return name
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
res_list = self._load_all(params)
|
res_list = self._load_all(params)
|
||||||
|
|
||||||
@ -295,6 +301,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
|||||||
|
|
||||||
yield full_name, index_src
|
yield full_name, index_src
|
||||||
|
|
||||||
|
def _get_coll(self, name):
|
||||||
|
return name.split(os.path.sep, 1)[0]
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '{0}(file://{1})'.format(self.__class__.__name__,
|
return '{0}(file://{1})'.format(self.__class__.__name__,
|
||||||
os.path.join(self.base_prefix, self.base_dir))
|
os.path.join(self.base_prefix, self.base_dir))
|
||||||
|
@ -41,6 +41,7 @@ class TestFuzzy(object):
|
|||||||
'is_fuzzy': True,
|
'is_fuzzy': True,
|
||||||
'urlkey': canonicalize(url),
|
'urlkey': canonicalize(url),
|
||||||
'source': 'source',
|
'source': 'source',
|
||||||
|
'source-coll': 'source',
|
||||||
'url': url,
|
'url': url,
|
||||||
'mime': mime}]
|
'mime': mime}]
|
||||||
|
|
||||||
|
@ -41,9 +41,10 @@ class PrefixResolver(object):
|
|||||||
if '*' not in path:
|
if '*' not in path:
|
||||||
return path
|
return path
|
||||||
|
|
||||||
res_path = self.resolve_coll(path, cdx.get('source'))
|
#res_path = self.resolve_coll(path, cdx.get('source'))
|
||||||
if res_path:
|
coll = cdx.get('source-coll')
|
||||||
return res_path
|
if coll:
|
||||||
|
return path.replace('*', coll)
|
||||||
|
|
||||||
if '://' in path:
|
if '://' in path:
|
||||||
return path
|
return path
|
||||||
|
@ -35,6 +35,7 @@ class TestPathIndex(object):
|
|||||||
|
|
||||||
cdx = CDXObject()
|
cdx = CDXObject()
|
||||||
cdx['source'] = 'my-coll/indexes/index.cdxj'
|
cdx['source'] = 'my-coll/indexes/index.cdxj'
|
||||||
|
cdx['source-coll'] = 'my-coll'
|
||||||
|
|
||||||
res = resolver('example.warc.gz', cdx)
|
res = resolver('example.warc.gz', cdx)
|
||||||
assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'
|
assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'
|
||||||
|
@ -142,7 +142,8 @@ class TestBaseWarcServer(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
|||||||
cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
|
cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
|
||||||
cdxlist[0]['timestamp'] = '2016'
|
cdxlist[0]['timestamp'] = '2016'
|
||||||
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
|
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
|
||||||
'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
|
'mime': '', 'load_url': 'http://httpbin.org/get',
|
||||||
|
'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])
|
||||||
|
|
||||||
def test_live_resource(self):
|
def test_live_resource(self):
|
||||||
headers = {'foo': 'bar'}
|
headers = {'foo': 'bar'}
|
||||||
|
@ -23,8 +23,10 @@ class MementoMixin(object):
|
|||||||
format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
|
format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
|
||||||
return format_.format(url, fmod_slash, coll)
|
return format_.format(url, fmod_slash, coll)
|
||||||
|
|
||||||
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'):
|
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
|
||||||
format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
|
format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
|
||||||
|
if include_coll:
|
||||||
|
format_ += '; collection="{4}"'
|
||||||
return format_.format(url, ts, dt, fmod, coll)
|
return format_.format(url, ts, dt, fmod, coll)
|
||||||
|
|
||||||
|
|
||||||
|
@ -181,7 +181,7 @@ class TestCDXApp(BaseTestClass):
|
|||||||
originals = {}
|
originals = {}
|
||||||
for cdx in cdxes:
|
for cdx in cdxes:
|
||||||
cdx = CDXObject(cdx.encode('utf-8'))
|
cdx = CDXObject(cdx.encode('utf-8'))
|
||||||
assert len(cdx) == 15
|
assert len(cdx) == 16
|
||||||
|
|
||||||
# orig.* fields are either all '-' or (int, int, filename)
|
# orig.* fields are either all '-' or (int, int, filename)
|
||||||
# check if orig.* fields are equals to corresponding fields
|
# check if orig.* fields are equals to corresponding fields
|
||||||
|
@ -54,7 +54,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
|||||||
|
|
||||||
links = self.get_links(resp)
|
links = self.get_links(resp)
|
||||||
|
|
||||||
assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links
|
assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
|
||||||
|
|
||||||
#timegate link
|
#timegate link
|
||||||
assert self.make_timegate_link(url, 'mp_') in links
|
assert self.make_timegate_link(url, 'mp_') in links
|
||||||
@ -131,8 +131,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
|||||||
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
|
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
|
||||||
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
|
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
|
||||||
<http://example.com?example=1>; rel="original",
|
<http://example.com?example=1>; rel="original",
|
||||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx",
|
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
|
||||||
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx"
|
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
|
||||||
"""
|
"""
|
||||||
assert exp == resp.text
|
assert exp == resp.text
|
||||||
|
|
||||||
@ -148,8 +148,8 @@ class TestMemento(MementoMixin, BaseConfigTest):
|
|||||||
resp.charset = 'utf-8'
|
resp.charset = 'utf-8'
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
|
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
|
||||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
|
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
|
||||||
"""
|
"""
|
||||||
assert exp == resp.text
|
assert exp == resp.text
|
||||||
|
|
||||||
|
@ -60,7 +60,7 @@ class TestProxy(BaseTestProxy):
|
|||||||
assert 'WB Insert' in res.text
|
assert 'WB Insert' in res.text
|
||||||
assert 'Example Domain' in res.text
|
assert 'Example Domain' in res.text
|
||||||
|
|
||||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
|
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,6 +97,10 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
|||||||
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
|
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
|
||||||
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
|
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
|
||||||
|
|
||||||
|
assert cdxj_lines[0]['source-coll'] == 'test'
|
||||||
|
assert cdxj_lines[1]['source-coll'] == 'test2'
|
||||||
|
assert cdxj_lines[2]['source-coll'] == 'test'
|
||||||
|
|
||||||
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
||||||
|
|
||||||
def test_timemap_all_coll(self):
|
def test_timemap_all_coll(self):
|
||||||
@ -104,8 +108,8 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
|||||||
link_lines = res.text.rstrip().split('\n')
|
link_lines = res.text.rstrip().split('\n')
|
||||||
assert len(link_lines) == 5
|
assert len(link_lines) == 5
|
||||||
|
|
||||||
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
|
assert to_path('collection="test2"') in link_lines[3]
|
||||||
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
|
assert to_path('collection="test"') in link_lines[4]
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
Loading…
x
Reference in New Issue
Block a user