diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index f28e92a3..73c557a1 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -43,6 +43,7 @@ class FrontEndApp(object): self.url_map.add(Rule('/static/_//', endpoint=self.serve_static)) self.url_map.add(Rule('/static/', endpoint=self.serve_static)) self.url_map.add(Rule('//', endpoint=self.serve_coll_page)) + self.url_map.add(Rule('//timemap//', endpoint=self.serve_content)) self.url_map.add(Rule('//', endpoint=self.serve_content)) self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing)) self.url_map.add(Rule('/', endpoint=self.serve_home)) @@ -116,7 +117,7 @@ class FrontEndApp(object): return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') - def serve_content(self, environ, coll='', url=''): + def serve_content(self, environ, coll='', url='', timemap_output=''): if not self.is_valid_coll(coll): self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll)) @@ -128,6 +129,8 @@ class FrontEndApp(object): wb_url_str += '?' + environ.get('QUERY_STRING') metadata = self.get_metadata(coll) + if timemap_output: + metadata['output'] = timemap_output try: response = self.rewriterapp.render_content(wb_url_str, metadata, environ) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 1162b046..6531db49 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie from pywb.utils.io import BUFF_SIZE from pywb.utils.memento import MementoUtils -from warcio.timeutils import http_date_to_timestamp +from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader @@ -98,23 +98,36 @@ class RewriterApp(object): wb_url.mod == self.frame_mod and wb_url.is_replay()) + def _check_accept_dt(self, wb_url, environ): + is_timegate = False + if wb_url.is_latest_replay(): + accept_dt = environ.get('HTTP_ACCEPT_DATETIME') + is_timegate = True + if accept_dt: + try: + wb_url.timestamp = http_date_to_timestamp(accept_dt) + except: + raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime') + #return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request') + + wb_url.type = wb_url.REPLAY + + return is_timegate + def render_content(self, wb_url, kwargs, environ): wb_url = WbUrl(wb_url) + is_timegate = self._check_accept_dt(wb_url, environ) host_prefix = self.get_host_prefix(environ) rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix - resp = self.handle_custom_response(environ, wb_url, - full_prefix, host_prefix, kwargs) - if resp is not None: - content_type = 'text/html' + response = self.handle_custom_response(environ, wb_url, + full_prefix, host_prefix, + kwargs) - # if not replay outer frame, specify utf-8 charset - if not self.is_framed_replay(wb_url): - content_type += '; charset=utf-8' - - return WbResponse.text_response(resp, content_type=content_type) + if response: + return self.format_response(response, wb_url, full_prefix, is_timegate) is_proxy = ('wsgiprox.proxy_host' in environ) @@ -278,7 +291,9 @@ class RewriterApp(object): status_headers.statusline += ' None' if not is_ajax and self.enable_memento: - self._add_memento_links(urlrewriter, full_prefix, memento_dt, status_headers) + self._add_memento_links(cdx['url'], full_prefix, + memento_dt, cdx['timestamp'], status_headers, + is_timegate) set_content_loc = True @@ -295,20 +310,67 @@ class RewriterApp(object): return response - def _add_memento_links(self, urlrewriter, full_prefix, memento_dt, status_headers): - wb_url = urlrewriter.wburl - status_headers.headers.append(('Memento-Datetime', memento_dt)) + def format_response(self, response, wb_url, full_prefix, is_timegate): + memento_ts = None + if not isinstance(response, WbResponse): + content_type = 'text/html' - memento_url = full_prefix + str(wb_url) - timegate_url = urlrewriter.get_new_url(timestamp='') + # if not replay outer frame, specify utf-8 charset + if not self.is_framed_replay(wb_url): + content_type += '; charset=utf-8' + else: + memento_ts = wb_url.timestamp + + response = WbResponse.text_response(response, content_type=content_type) + + self._add_memento_links(wb_url.url, full_prefix, None, memento_ts, + response.status_headers, is_timegate) + return response + + def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, + status_headers, is_timegate): + + # memento url + header + if not memento_dt and memento_ts: + memento_dt = timestamp_to_http_date(memento_ts) + + if memento_dt: + status_headers.headers.append(('Memento-Datetime', memento_dt)) + + memento_url = full_prefix + memento_ts + self.replay_mod + memento_url += '/' + url + else: + memento_url = None + + timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix) link = [] + link.append(MementoUtils.make_link(url, 'original')) link.append(MementoUtils.make_link(timegate_url, 'timegate')) - link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) + link.append(MementoUtils.make_link(timemap_url, 'timemap')) + + if memento_dt: + link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) + link_str = ', '.join(link) status_headers.headers.append(('Link', link_str)) + if is_timegate: + status_headers.headers.append(('Vary', 'accept-datetime')) + + def _get_timegate_timemap(self, url, full_prefix): + # timegate url + timegate_url = full_prefix + if self.replay_mod: + timegate_url += self.replay_mod + '/' + + timegate_url += url + + # timemap url + timemap_url = full_prefix + 'timemap/link/' + url + return timegate_url, timemap_url + def get_top_url(self, full_prefix, wb_url, cdx, kwargs): top_url = full_prefix top_url += wb_url.to_str(mod='') @@ -389,7 +451,7 @@ class RewriterApp(object): def do_query(self, wb_url, kwargs): params = {} params['url'] = wb_url.url - params['output'] = 'json' + params['output'] = kwargs.get('output', 'json') params['from'] = wb_url.timestamp params['to'] = wb_url.end_timestamp @@ -398,11 +460,37 @@ class RewriterApp(object): r = requests.get(upstream_url) - return r.text + return r - def handle_query(self, environ, wb_url, kwargs): + def make_timemap(self, wb_url, res, full_prefix): + wb_url.type = wb_url.QUERY + + content_type = res.headers.get('Content-Type') + text = res.text + + if not res.text: + status = '404 Not Found' + + elif res.status_code: + status = str(res.status_code) + ' ' + res.reason + + if res.status_code == 200: + timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix) + + text = MementoUtils.wrap_timemap_header(wb_url.url, + timegate, + timemap, + res.text) + return WbResponse.text_response(text, + content_type=content_type, + status=status) + + def handle_query(self, environ, wb_url, kwargs, full_prefix): res = self.do_query(wb_url, kwargs) + if kwargs.get('output'): + return self.make_timemap(wb_url, res, full_prefix) + def format_cdx(text): cdx_lines = text.rstrip().split('\n') for cdx in cdx_lines: @@ -417,7 +505,7 @@ class RewriterApp(object): params = dict(url=wb_url.url, prefix=prefix, - cdx_lines=list(format_cdx(res))) + cdx_lines=list(format_cdx(res.text))) extra_params = self.get_query_params(wb_url, kwargs) if extra_params: @@ -506,8 +594,8 @@ class RewriterApp(object): return None def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs): - if wb_url.is_query(): - return self.handle_query(environ, wb_url, kwargs) + if wb_url.is_query() or kwargs.get('output'): + return self.handle_query(environ, wb_url, kwargs, full_prefix) if self.is_framed_replay(wb_url): extra_params = self.get_top_frame_params(wb_url, kwargs) diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py index de385171..e948e669 100644 --- a/pywb/utils/memento.py +++ b/pywb/utils/memento.py @@ -11,6 +11,9 @@ LINK_SEG_SPLIT = re.compile(';\s*') LINK_URL = re.compile('<(.*)>') LINK_PROP = re.compile('([\w]+)="([^"]+)') +FIND_DT = re.compile('datetime=\"([^\"]+)\"') + + #============================================================================= class MementoException(BadRequestException): pass @@ -18,8 +21,8 @@ class MementoException(BadRequestException): #============================================================================= class MementoUtils(object): - @staticmethod - def parse_links(link_header, def_name='timemap'): + @classmethod + def parse_links(cls, link_header, def_name='timemap'): links = LINK_SPLIT.split(link_header) results = {} mementos = [] @@ -61,9 +64,9 @@ class MementoUtils(object): results['mementos'] = mementos return results - @staticmethod - def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'): - url = cdx.get('load_url') + @classmethod + def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n'): + url = cdx.get('url') if not url: url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) @@ -74,37 +77,43 @@ class MementoUtils(object): return memento.format(url, rel, datetime, cdx.get('source', '')) - @staticmethod - def make_timemap(cdx_iter): - # get first memento as it'll be used for 'from' field - try: - first_cdx = six.next(cdx_iter) - from_date = timestamp_to_http_date(first_cdx['timestamp']) - except StopIteration: - first_cdx = None - return - - # first memento link - yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date) - + @classmethod + def make_timemap(cls, cdx_iter): prev_cdx = None for cdx in cdx_iter: if prev_cdx: - yield MementoUtils.make_timemap_memento_link(prev_cdx) + yield cls.make_timemap_memento_link(prev_cdx) prev_cdx = cdx # last memento link, if any if prev_cdx: - yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n') + yield cls.make_timemap_memento_link(prev_cdx, end='\n') + + @classmethod + def wrap_timemap_header(cls, url, timegate_url, timemap_url, timemap): + string = cls.make_link(timemap_url, "self") + m = FIND_DT.search(timemap) + if m: + string += '; from="{0}"'.format(m.group(1)) + + string += ',\n' + + string += cls.make_link(timegate_url, "timegate") + ',\n' + string += cls.make_link(url, "original") + ',\n' + string += timemap + return string + + @classmethod + def make_link(cls, url, type): + if type in ('timemap', 'self'): + return '<{0}>; rel="{1}"; type="application/link-format"'.format(url, type) - @staticmethod - def make_link(url, type): return '<{0}>; rel="{1}"'.format(url, type) - @staticmethod - def make_memento_link(url, type, dt): + @classmethod + def make_memento_link(cls, url, type, dt): return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 785de72d..a8f2dd11 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -24,7 +24,7 @@ def to_text(cdx_iter, fields): return content_type, (cdx.to_text(fields) for cdx in cdx_iter) def to_link(cdx_iter, fields): - content_type = 'application/link' + content_type = 'application/link-format' return content_type, MementoUtils.make_timemap(cdx_iter) diff --git a/tests/memento_fixture.py b/tests/memento_fixture.py index 150cbcf2..28ed7fc9 100644 --- a/tests/memento_fixture.py +++ b/tests/memento_fixture.py @@ -11,11 +11,20 @@ class MementoMixin(object): return list(map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))) def make_timemap_link(self, url, coll='pywb'): - format_ = '; rel="timemap"; type="{1}"' + format_ = '; rel="timemap"; type="{1}"' return format_.format(url, LINK_FORMAT, coll) - def make_memento_link(self, url, ts, dt, coll='pywb'): - format_ = '; rel="memento"; datetime="{2}"' - return format_.format(url, ts, dt, coll) + def make_original_link(self, url): + format_ = '<{0}>; rel="original"' + return format_.format(url) + + def make_timegate_link(self, url, fmod='', coll='pywb'): + fmod_slash = fmod + '/' if fmod else '' + format_ = '; rel="timegate"' + return format_.format(url, fmod_slash, coll) + + def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'): + format_ = '; rel="memento"; datetime="{2}"' + return format_.format(url, ts, dt, fmod, coll) diff --git a/tests/test_memento.py b/tests/test_memento.py index 340a8ba3..c1745a0c 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -2,13 +2,45 @@ from .base_config_test import BaseConfigTest, fmod from .memento_fixture import * +from warcio.timeutils import timestamp_to_http_date + + # ============================================================================ class TestMemento(MementoMixin, BaseConfigTest): @classmethod def setup_class(cls): super(TestMemento, cls).setup_class('config_test.yaml') - def _test_top_frame_replay(self): + def _assert_memento(self, resp, url, ts, fmod, dt=''): + dt = dt or timestamp_to_http_date(ts) + + links = self.get_links(resp) + + assert MEMENTO_DATETIME in resp.headers + assert resp.headers[MEMENTO_DATETIME] == dt + + # memento link + memento_link = self.make_memento_link(url, ts, dt, fmod) + assert memento_link in links + + # content location + assert '/pywb/{1}{0}/{2}'.format(fmod, ts, url) in resp.headers['Content-Location'] + + # content location part of memento link + assert resp.headers['Content-Location'] in memento_link + + # timegate link + assert self.make_timegate_link(url, fmod) in links + + # timemap link + assert self.make_timemap_link(url) in links + + # original + assert self.make_original_link(url) in links + + + # Memento Pattern 2.2 (no redirect, 200 negotiation) + def test_memento_top_frame(self): resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/') # Memento Headers @@ -18,37 +50,119 @@ class TestMemento(MementoMixin, BaseConfigTest): # memento link dt = 'Mon, 27 Jan 2014 17:12:38 GMT' + url = 'http://www.iana.org/' links = self.get_links(resp) - assert self.make_memento_link('http://www.iana.org/', '20140127171238mp_', dt) in links + + assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links #timegate link - assert '; rel="timegate"' in links + assert self.make_timegate_link(url, 'mp_') in links # Body assert '