diff --git a/config.yaml b/config.yaml index 67f5dc71..319fe1fe 100644 --- a/config.yaml +++ b/config.yaml @@ -101,3 +101,5 @@ enable_cdx_api: true # Permissions checker #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] +# Memento support, enable +enable_memento: true diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index c993a817..02b57c0e 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -17,12 +17,6 @@ class CDXException(WbException): return '400 Bad Request' -#================================================================= -class CaptureNotFoundException(CDXException): - def status(self): - return '404 Not Found' - - #================================================================= class CDXObject(OrderedDict): """ diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index ee02edbe..24a34557 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,9 +1,10 @@ from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.wbexception import NotFoundException from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster -from cdxobject import CDXObject, CaptureNotFoundException, CDXException +from cdxobject import CDXObject, CDXException from query import CDXQuery from cdxdomainspecific import load_domain_specific_cdx_rules @@ -41,7 +42,7 @@ class BaseCDXServer(object): """ Check cdx iter semantics If `cdx_iter` is empty (no matches), check if fuzzy matching is allowed, and try it -- otherwise, - throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` + throw :exc:`~pywb.utils.wbexception.NotFoundException` """ cdx_iter = self.peek_iter(cdx_iter) @@ -60,7 +61,7 @@ class BaseCDXServer(object): return self.load_cdx_query(fuzzy_query_params) msg = 'No Captures found for: ' + query.url - raise CaptureNotFoundException(msg) + raise NotFoundException(msg) def load_cdx(self, **params): return self.load_cdx_query(CDXQuery(**params)) @@ -99,7 +100,7 @@ class CDXServer(BaseCDXServer): ``matchType`` parameter specifies matching method for ``key`` (default ``exact``). other parameters are passed down to :func:`cdx_load`. - raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` + raises :exc:`~pywb.utils.wbexception.NotFoundException` if no captures are found. :param query: query parameters diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 28a669ed..ebfffb91 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -26,12 +26,12 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq # No matching results >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) Traceback (most recent call last): -CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this +NotFoundException: No Captures found for: http://iana.org/dont_have_this # No matching -- limit=1 >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) Traceback (most recent call last): -CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this +NotFoundException: No Captures found for: http://iana.org/dont_have_this # Filter cdx (default: regex) >>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 473632a1..876b9393 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -2,9 +2,10 @@ import pkgutil import mimetypes import time +from pywb.utils.wbexception import NotFoundException from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.wbexceptions import WbException, NotFoundException +from pywb.framework.wbexceptions import WbException from views import TextCapturesView diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index 741aa489..626a9dfd 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -1,5 +1,7 @@ from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.proxy import ProxyArchivalRouter +from pywb.framework.wbrequestresponse import WbRequest +from pywb.framework.memento import MementoRequest from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader @@ -37,6 +39,8 @@ DEFAULTS = { 'static_routes': {'static/default': 'static/'}, 'domain_specific_rules': 'pywb/rules.yaml', + + 'enable_memento': True, } #================================================================= @@ -86,6 +90,8 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None): redir_to_exact=config.get('redir_to_exact', True), + memento=config.get('enable_memento', False), + reporter=config.get('reporter') ) @@ -126,6 +132,12 @@ def create_wb_router(passed_config = {}): # collections based on cdx source collections = config.get('collections') + if config.get('enable_memento', False): + request_class = MementoRequest + else: + request_class = WbRequest + + for name, value in collections.iteritems(): if isinstance(value, str): value = {'index_paths': value} @@ -151,7 +163,9 @@ def create_wb_router(passed_config = {}): route_class = route_config.get('route_class', Route) - routes.append(route_class(name, wb_handler, config = route_config)) + routes.append(route_class(name, wb_handler, + config=route_config, + request_class=request_class)) # cdx query handler if route_config.get('enable_cdx_api', False): diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index f4f5346d..4a6ab4e4 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -2,20 +2,25 @@ import re from io import BytesIO from pywb.utils.bufferedreaders import ChunkedDataReader -from pywb.framework.wbrequestresponse import WbResponse +from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.framework.wbexceptions import CaptureException, InternalRedirect +from pywb.framework.wbrequestresponse import WbResponse +from pywb.framework.memento import MementoResponse + +from pywb.framework.wbexceptions import CaptureException from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.loaders import LimitReader + #================================================================= class ReplayView: STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') - def __init__(self, content_loader, content_rewriter, head_insert_view = None, - redir_to_exact = True, buffer_response = False, reporter = None): + def __init__(self, content_loader, content_rewriter, head_insert_view=None, + redir_to_exact=True, buffer_response=False, reporter=None, + memento=False): self.content_loader = content_loader self.content_rewriter = content_rewriter @@ -28,6 +33,11 @@ class ReplayView: self._reporter = reporter + if memento: + self.response_class = MementoResponse + else: + self.response_class = WbResponse + def __call__(self, wbrequest, cdx_lines, cdx_loader): last_e = None @@ -42,7 +52,10 @@ class ReplayView: try: # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data if first: - self._redirect_if_needed(wbrequest, cdx) + redir_response = self._redirect_if_needed(wbrequest, cdx) + if redir_response: + return redir_response + first = False (status_headers, stream) = (self.content_loader. @@ -52,7 +65,9 @@ class ReplayView: self._reject_self_redirect(wbrequest, cdx, status_headers) # check if redir is needed - self._redirect_if_needed(wbrequest, cdx) + redir_response = self._redirect_if_needed(wbrequest, cdx) + if redir_response: + return redir_response # one more check for referrer-based self-redirect self._reject_referrer_self_redirect(wbrequest) @@ -121,12 +136,16 @@ class ReplayView: # no rewriting needed! if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) - return WbResponse(rewritten_headers.status_headers, response_iter) + return self.response_class(rewritten_headers.status_headers, + response_iter, + wbrequest=wbrequest, + cdx=cdx) def make_head_insert(rule): - return (self.head_insert_view.render_to_string(wbrequest=wbrequest, - cdx=cdx, - rule=rule)) + return (self.head_insert_view. + render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) # do head insert if self.head_insert_view: head_insert_func = make_head_insert @@ -145,9 +164,12 @@ class ReplayView: if wbrequest.wb_url.mod == 'id_': status_headers.remove_header('content-length') - return self.buffered_response(status_headers, response_gen) + response_gen = self.buffered_response(status_headers, response_gen) - return WbResponse(status_headers, response_gen) + return self.response_class(status_headers, + response_gen, + wbrequest=wbrequest, + cdx=cdx) # Buffer rewrite iterator and return a response from a string @@ -165,15 +187,27 @@ class ReplayView: status_headers.headers.append(('Content-Length', content_length_str)) out.close() - return WbResponse(status_headers, value = [content]) - + return content def _redirect_if_needed(self, wbrequest, cdx): - if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): - new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) - raise InternalRedirect(new_url) + if wbrequest.is_proxy: + return None - return None + redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate + + if not redir_needed and self.redir_to_exact: + redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) + + if not redir_needed: + return None + + new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) + status_headers = StatusAndHeaders('302 Internal Redirect', + [('Location', new_url)]) + + # don't include cdx to indicate internal redirect + return self.response_class(status_headers, + wbrequest=wbrequest) def _reject_self_redirect(self, wbrequest, cdx, status_headers): diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 78062864..4f5278de 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -9,27 +9,23 @@ from wbrequestresponse import WbRequest, WbResponse # ArchivalRouter -- route WB requests in archival mode #================================================================= class ArchivalRouter(object): - def __init__(self, routes, - hostpaths=None, - port=None, - abs_path=True, - home_view=None, - error_view=None): - + def __init__(self, routes, **kwargs): self.routes = routes # optional port setting may be ignored by wsgi container - self.port = port + self.port = kwargs.get('port') + + hostpaths = kwargs.get('hostpaths') if hostpaths: self.fallback = ReferRedirect(hostpaths) else: self.fallback = None - self.abs_path = abs_path + self.abs_path = kwargs.get('abs_path') - self.home_view = home_view - self.error_view = error_view + self.home_view = kwargs.get('home_view') + self.error_view = kwargs.get('error_view') def __call__(self, env): for route in self.routes: @@ -62,6 +58,7 @@ class Route(object): SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)' def __init__(self, regex, handler, coll_group=0, config={}, + request_class=WbRequest, lookahead=SLASH_QUERY_LOOKAHEAD): self.path = regex @@ -70,6 +67,7 @@ class Route(object): else: self.regex = re.compile('') self.handler = handler + self.request_class = request_class # collection id from regex group (default 0) self.coll_group = coll_group self._custom_init(config) @@ -98,7 +96,7 @@ class Route(object): coll = matcher.group(self.coll_group) - wbrequest = WbRequest(env, + wbrequest = self.request_class(env, request_uri=request_uri, wb_url_str=wb_url_str, rel_prefix=rel_prefix, diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py new file mode 100644 index 00000000..8f380121 --- /dev/null +++ b/pywb/framework/memento.py @@ -0,0 +1,92 @@ +from pywb.utils.wbexception import BadRequestException +from pywb.utils.timeutils import http_date_to_timestamp +from pywb.utils.timeutils import timestamp_to_http_date + +from wbrequestresponse import WbRequest, WbResponse + + +#================================================================= +class MementoReqMixin(object): + def _parse_extra(self): + self.is_timegate = False + + if not self.wb_url: + return + + if self.wb_url.type != self.wb_url.LATEST_REPLAY: + return + + self.is_timegate = True + + accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME') + if not accept_datetime: + return + + try: + timestamp = http_date_to_timestamp(accept_datetime) + except Exception: + raise BadRequestException('Invalid Accept-Datetime: ' + + accept_datetime) + + self.wb_url.set_replay_timestamp(timestamp) + + +#================================================================= +class MementoRequest(MementoReqMixin, WbRequest): + pass + + +#================================================================= +class MementoRespMixin(object): + def _init_derived(self, params): + wbrequest = params.get('wbrequest') + cdx = params.get('cdx') + + if not wbrequest or not wbrequest.wb_url: + return + + is_timegate = wbrequest.is_timegate + + if is_timegate: + self.status_headers.headers.append(('Vary', 'accept-datetime')) + + # Determine if memento: + # if no cdx included, definitely not a memento + if not cdx: + is_memento = False + + # otherwise, if in proxy mode, then always a memento + elif wbrequest.is_proxy: + is_memento = True + + # otherwise only for replay + else: + is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY) + + if is_memento: + http_date = timestamp_to_http_date(cdx['timestamp']) + self.status_headers.headers.append(('Memento-Datetime', http_date)) + + req_url = wbrequest.wb_url.url + + if is_memento and is_timegate: + link = self.make_link(req_url, 'original timegate') + elif is_memento: + timegate = wbrequest.urlrewriter.get_timestamp_url('') + + link = [] + link.append(self.make_link(req_url, 'original')) + link.append(self.make_link(timegate, 'timegate')) + link = ', '.join(link) + else: + link = self.make_link(req_url, 'original') + + self.status_headers.headers.append(('Link', link)) + + def make_link(self, url, type): + return '<{0}>; rel="{1}"'.format(url, type) + + +#================================================================= +class MementoResponse(MementoRespMixin, WbResponse): + pass diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index fa0421b0..4b53b8b4 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -10,23 +10,12 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter # only latest capture is available currently #================================================================= class ProxyArchivalRouter(ArchivalRouter): - def __init__(self, routes, - hostpaths=None, - port=None, - abs_path=True, - home_view=None, - error_view=None): - - (super(ProxyArchivalRouter, self). - __init__(routes, - hostpaths=hostpaths, - port=port, - abs_path=abs_path, - home_view=home_view, - error_view=error_view)) - - self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view) - #self.error_view = error_view + def __init__(self, routes, **kwargs): + super(ProxyArchivalRouter, self).__init__(routes, **kwargs) + request_class = routes[0].request_class + self.proxy = ProxyRouter(routes[0].handler, + request_class=request_class, + **kwargs) def __call__(self, env): response = self.proxy(env) @@ -44,11 +33,12 @@ class ProxyArchivalRouter(ArchivalRouter): # Only supports latest capture replay at the moment #================================================================= class ProxyRouter: - def __init__(self, handler, hostpaths=None, error_view=None): + def __init__(self, handler, **kwargs): self.handler = handler - self.hostpaths = hostpaths + self.hostpaths = kwargs.get('hostpaths') - self.error_view = error_view + self.error_view = kwargs.get('error_view') + self.request_class = kwargs.get('request_class') def __call__(self, env): url = env['REL_REQUEST_URI'] @@ -59,10 +49,9 @@ class ProxyRouter: if not url.startswith('http://'): return None - wbrequest = WbRequest(env, + wbrequest = self.request_class(env, request_uri=url, wb_url_str=url, - #rel_prefix=url, host_prefix=self.hostpaths[0], wburl_class=self.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, diff --git a/pywb/framework/wbexceptions.py b/pywb/framework/wbexceptions.py index 97743f17..dc7621c6 100644 --- a/pywb/framework/wbexceptions.py +++ b/pywb/framework/wbexceptions.py @@ -1,22 +1,8 @@ from pywb.utils.wbexception import WbException -class NotFoundException(WbException): - def status(self): - return '404 Not Found' - - # Exceptions that effect a specific capture and result in a retry class CaptureException(WbException): def status(self): return '502 Internal Server Error' - -class InternalRedirect(WbException): - def __init__(self, location, status='302 Internal Redirect'): - WbException.__init__(self, 'Redirecting -> ' + location) - self.status = status - self.httpHeaders = [('Location', location)] - - def status(self): - return self.status diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 3ef091d9..34912e5d 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -1,9 +1,9 @@ from pywb.utils.statusandheaders import StatusAndHeaders + import pprint - #================================================================= -class WbRequest: +class WbRequest(object): """ Represents the main pywb request object. @@ -84,6 +84,8 @@ class WbRequest: # PERF env['X_PERF'] = {} + self._parse_extra() + def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if not value: @@ -101,18 +103,25 @@ class WbRequest: varstr = pprint.pformat(varlist) return varstr + def _parse_extra(self): + pass + #================================================================= -class WbResponse: +class WbResponse(object): """ Represnts a pywb wsgi response object. Holds a status_headers object and a response iter, to be returned to wsgi container. """ - def __init__(self, status_headers, value=[]): + def __init__(self, status_headers, value=[], **kwargs): self.status_headers = status_headers self.body = value + self._init_derived(kwargs) + + def _init_derived(self, params): + pass @staticmethod def text_stream(stream, status='200 OK', content_type='text/plain'): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 17f2cd7f..13b425e9 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -1,7 +1,7 @@ -from pywb.utils.wbexception import WbException +from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.loaders import load_yaml_config -from wbexceptions import NotFoundException, InternalRedirect +#from wbexceptions import InternalRedirect from wbrequestresponse import WbResponse, StatusAndHeaders @@ -66,8 +66,8 @@ class WSGIApp(object): msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) raise NotFoundException(msg) - except InternalRedirect as ir: - response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) +# except InternalRedirect as ir: +# return ir.response except WbException as e: response = handle_exception(env, wb_router, e, False) diff --git a/pywb/perms/perms_handler.py b/pywb/perms/perms_handler.py index 1a1b9507..58f4a21b 100644 --- a/pywb/perms/perms_handler.py +++ b/pywb/perms/perms_handler.py @@ -1,9 +1,9 @@ from pywb.utils.canonicalize import UrlCanonicalizer +from pywb.utils.wbexception import NotFoundException from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.wbexceptions import NotFoundException BLOCK = '["block"]' ALLOW = '["allow"]' diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 1b2e3a0b..5115eed6 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -214,6 +214,11 @@ class WbUrl(BaseWbUrl): return True + def set_replay_timestamp(self, timestamp): + self.timestamp = timestamp + self.type = self.REPLAY + + # Str Representation # ==================== def to_str(self, **overrides): diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 6979a323..79aed38f 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -4,7 +4,7 @@ import surt import urlparse -from wbexception import WbException +from wbexception import BadRequestException #================================================================= @@ -17,10 +17,8 @@ class UrlCanonicalizer(object): #================================================================= -class UrlCanonicalizeException(WbException): - def status(self): - return '400 Bad Request' - +class UrlCanonicalizeException(BadRequestException): + pass #================================================================= def canonicalize(url, surt_ordered=True): diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index a89424aa..c09e8d27 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -8,6 +8,7 @@ import time import datetime import calendar from itertools import imap +from email.utils import parsedate, formatdate #================================================================= # str <-> datetime conversion @@ -38,6 +39,30 @@ def iso_date_to_datetime(string): return the_datetime +def http_date_to_datetime(string): + """ + >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT') + datetime.datetime(2013, 12, 26, 9, 50, 10) + """ + return datetime.datetime(*parsedate(string)[:6]) + + +def datetime_to_http_date(the_datetime): + """ + >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10)) + 'Thu, 26 Dec 2013 09:50:10 GMT' + + # Verify inverses + >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT' + >>> datetime_to_http_date(http_date_to_datetime(x)) == x + True + """ + timeval = calendar.timegm(the_datetime.utctimetuple()) + return formatdate(timeval=timeval, + localtime=False, + usegmt=True) + + def datetime_to_timestamp(the_datetime): """ >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) @@ -59,6 +84,17 @@ def iso_date_to_timestamp(string): return datetime_to_timestamp(iso_date_to_datetime(string)) +def http_date_to_timestamp(string): + """ + >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT') + '20131226095000' + + >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT') + '20140126200804' + """ + return datetime_to_timestamp(http_date_to_datetime(string)) + + # pad to certain length (default 6) def _pad_timestamp(string, pad_str=PAD_6): """ @@ -215,6 +251,17 @@ def timestamp_to_sec(string): return calendar.timegm(timestamp_to_datetime(string).utctimetuple()) +def timestamp_to_http_date(string): + """ + >>> timestamp_to_http_date('20131226095000') + 'Thu, 26 Dec 2013 09:50:00 GMT' + + >>> timestamp_to_http_date('20140126200804') + 'Sun, 26 Jan 2014 20:08:04 GMT' + """ + return datetime_to_http_date(timestamp_to_datetime(string)) + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py index a31002b5..c230abc1 100644 --- a/pywb/utils/wbexception.py +++ b/pywb/utils/wbexception.py @@ -10,3 +10,15 @@ class WbException(Exception): class AccessException(WbException): def status(self): return '403 Access Denied' + + +#================================================================= +class BadRequestException(WbException): + def status(self): + return '400 Bad Request' + + +#================================================================= +class NotFoundException(WbException): + def status(self): + return '404 Not Found' diff --git a/tests/fixture.py b/tests/fixture.py index e42abfcc..16120790 100644 --- a/tests/fixture.py +++ b/tests/fixture.py @@ -5,7 +5,7 @@ import yaml @pytest.fixture def testconfig(): - config = yaml.load(open('test_config.yaml')) + config = yaml.load(open('tests/test_config.yaml')) assert config if 'index_paths' not in config: # !!! assumes this module is in a sub-directory of project root. diff --git a/test_config.yaml b/tests/test_config.yaml similarity index 98% rename from test_config.yaml rename to tests/test_config.yaml index c955a649..9d854a67 100644 --- a/test_config.yaml +++ b/tests/test_config.yaml @@ -80,10 +80,6 @@ absoulte_paths: true static_routes: static/test/route: static/ - -# ==== New / Experimental Settings ==== -# Not yet production ready -- used primarily for testing - # Enable simple http proxy mode enable_http_proxy: true @@ -100,5 +96,11 @@ reporter: !!python/object/new:tests.fixture.PrintReporter [] # custom rules for domain specific matching #domain_specific_rules: rules.yaml +# ==== New / Experimental Settings ==== +# Not yet production ready -- used primarily for testing + #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] perms_policy: !!python/name:pywb.perms.test.test_perms_policy.perms_policy + +# not testing memento here +enable_memento: False diff --git a/tests/test_integration.py b/tests/test_integration.py index fe224e9f..526ca69d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -4,7 +4,7 @@ from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject class TestWb: - TEST_CONFIG = 'test_config.yaml' + TEST_CONFIG = 'tests/test_config.yaml' def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) diff --git a/tests/test_memento.py b/tests/test_memento.py new file mode 100644 index 00000000..838c6463 --- /dev/null +++ b/tests/test_memento.py @@ -0,0 +1,175 @@ +import webtest +from pywb.core.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app +from pywb.cdx.cdxobject import CDXObject + +MEMENTO_DATETIME = 'Memento-Datetime' +ACCEPT_DATETIME = 'Accept-Datetime' +LINK = 'Link' +VARY = 'Vary' + +class TestWb: + TEST_CONFIG = 'tests/test_config_memento.yaml' + + def setup(self): + self.app = init_app(create_wb_router, + load_yaml=True, + config_file=self.TEST_CONFIG) + + self.testapp = webtest.TestApp(self.app) + + # Below functionality is for archival (non-proxy) mode + # It is designed to conform to Memento protocol Pattern 2.1 + # http://www.mementoweb.org/guide/rfc/#Pattern2.1 + + def test_timegate_latest(self): + """ + TimeGate with no Accept-Datetime header + """ + resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css') + + assert resp.status_int == 302 + + assert resp.headers[VARY] == 'accept-datetime' + assert resp.headers[LINK] == '; rel="original"' + assert MEMENTO_DATETIME not in resp.headers + + assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + + + def test_timegate_accept_datetime(self): + """ + TimeGate with Accept-Datetime header + """ + headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} + resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + + assert resp.status_int == 302 + + assert resp.headers[VARY] == 'accept-datetime' + assert resp.headers[LINK] == '; rel="original"' + assert MEMENTO_DATETIME not in resp.headers + + assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + + + def test_non_timegate_intermediate_redir(self): + """ + Not a timegate, but an 'intermediate resource', redirect to closest timestamp + """ + headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} + # not a timegate, partial timestamp /2014/ present + resp = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + + assert resp.status_int == 302 + + # no vary header + assert VARY not in resp.headers + assert resp.headers[LINK] == '; rel="original"' + assert MEMENTO_DATETIME not in resp.headers + + + # redirect to latest, not negotiation via Accept-Datetime + assert '/pywb/20140127171239/' in resp.headers['Location'] + + + def test_memento_url(self): + """ + Memento response, 200 capture + """ + resp = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css') + + assert resp.status_int == 200 + + assert VARY not in resp.headers + + assert resp.headers[LINK] == '; rel="original", \ +; rel="timegate"' + + assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT' + + + def test_302_memento(self): + """ + Memento (capture) of a 302 response + """ + resp = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example') + + assert resp.status_int == 302 + + assert VARY not in resp.headers + + assert resp.headers[LINK] == '; rel="original", \ +; rel="timegate"' + + assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT' + + + # Below functions test pywb proxy mode behavior + # They are designed to roughly conform to Memento protocol Pattern 1.3 + # with the exception that the original resource is not available + + def test_proxy_latest_memento(self): + """ + Proxy Mode memento with no Accept-Datetime + Both a timegate and a memento + """ + # simulate proxy mode by setting REQUEST_URI + request_uri = 'http://www.iana.org/_css/2013.1/screen.css' + extra = dict(REQUEST_URI=request_uri, SCRIPT_NAME='') + + resp = self.testapp.get('/x-ignore-this-x', extra_environ=extra) + + assert resp.status_int == 200 + + # for timegate + assert resp.headers[VARY] == 'accept-datetime' + + # for memento + assert resp.headers[LINK] == '; rel="original timegate"' + assert resp.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT' + + + def test_proxy_accept_datetime_memento(self): + """ + Proxy Mode memento with specific Accept-Datetime + Both a timegate and a memento + """ + # simulate proxy mode by setting REQUEST_URI + request_uri = 'http://www.iana.org/_css/2013.1/screen.css' + extra = dict(REQUEST_URI=request_uri, SCRIPT_NAME='') + headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} + + resp = self.testapp.get('/x-ignore-this-x', extra_environ=extra, headers=headers) + + assert resp.status_int == 200 + + # for timegate + assert resp.headers[VARY] == 'accept-datetime' + + # for memento + assert resp.headers[LINK] == '; rel="original timegate"' + assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT' + + + def test_error_bad_accept_datetime(self): + """ + 400 response for bad accept_datetime + """ + headers = {ACCEPT_DATETIME: 'Sun'} + resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css', headers=headers, status=400) + assert resp.status_int == 400 + + + def test_error_bad_accept_datetime_proxy(self): + """ + 400 response for bad accept_datetime + with proxy mode + """ + request_uri = 'http://www.iana.org/_css/2013.1/screen.css' + extra = dict(REQUEST_URI=request_uri, SCRIPT_NAME='') + headers = {ACCEPT_DATETIME: 'Sun, abc'} + + resp = self.testapp.get('/x-ignore-this-x', extra_environ=extra, headers=headers, status=400) + + assert resp.status_int == 400 diff --git a/tests/test_perms_app.py b/tests/test_perms_app.py index da7d3840..be9c99b4 100644 --- a/tests/test_perms_app.py +++ b/tests/test_perms_app.py @@ -5,7 +5,7 @@ from pywb.perms.perms_handler import ALLOW, BLOCK from pywb.framework.wsgi_wrappers import init_app class TestPermsApp: - TEST_CONFIG = 'test_config.yaml' + TEST_CONFIG = 'tests/test_config.yaml' def setup(self): self.app = init_app(create_perms_checker_app,