1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

first pass at memento support #10!

memento support enabled by default, togglable via 'enable_memento' config property
supporting timegate and memento apis, no timemap yet
supporting pattern 2.3 for archival and pattern 1.3 for proxy modes
also:
simplify exception hierarchy a bit more, move down to utils
make WbRequest and WbResponse extensible with mixins (eg for memento)
This commit is contained in:
Ilya Kreymer 2014-03-14 10:46:20 -07:00
parent dd9a2c635f
commit a1ab54c340
23 changed files with 460 additions and 101 deletions

View File

@ -101,3 +101,5 @@ enable_cdx_api: true
# Permissions checker
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
# Memento support, enable
enable_memento: true

View File

@ -17,12 +17,6 @@ class CDXException(WbException):
return '400 Bad Request'
#=================================================================
class CaptureNotFoundException(CDXException):
def status(self):
return '404 Not Found'
#=================================================================
class CDXObject(OrderedDict):
"""

View File

@ -1,9 +1,10 @@
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from pywb.utils.wbexception import NotFoundException
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxobject import CDXObject, CDXException
from query import CDXQuery
from cdxdomainspecific import load_domain_specific_cdx_rules
@ -41,7 +42,7 @@ class BaseCDXServer(object):
""" Check cdx iter semantics
If `cdx_iter` is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise,
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
throw :exc:`~pywb.utils.wbexception.NotFoundException`
"""
cdx_iter = self.peek_iter(cdx_iter)
@ -60,7 +61,7 @@ class BaseCDXServer(object):
return self.load_cdx_query(fuzzy_query_params)
msg = 'No Captures found for: ' + query.url
raise CaptureNotFoundException(msg)
raise NotFoundException(msg)
def load_cdx(self, **params):
return self.load_cdx_query(CDXQuery(**params))
@ -99,7 +100,7 @@ class CDXServer(BaseCDXServer):
``matchType`` parameter specifies matching method for ``key``
(default ``exact``).
other parameters are passed down to :func:`cdx_load`.
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
raises :exc:`~pywb.utils.wbexception.NotFoundException`
if no captures are found.
:param query: query parameters

View File

@ -26,12 +26,12 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])

View File

@ -2,9 +2,10 @@ import pkgutil
import mimetypes
import time
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import WbException, NotFoundException
from pywb.framework.wbexceptions import WbException
from views import TextCapturesView

View File

@ -1,5 +1,7 @@
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter
from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
@ -37,6 +39,8 @@ DEFAULTS = {
'static_routes': {'static/default': 'static/'},
'domain_specific_rules': 'pywb/rules.yaml',
'enable_memento': True,
}
#=================================================================
@ -86,6 +90,8 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
redir_to_exact=config.get('redir_to_exact', True),
memento=config.get('enable_memento', False),
reporter=config.get('reporter')
)
@ -126,6 +132,12 @@ def create_wb_router(passed_config = {}):
# collections based on cdx source
collections = config.get('collections')
if config.get('enable_memento', False):
request_class = MementoRequest
else:
request_class = WbRequest
for name, value in collections.iteritems():
if isinstance(value, str):
value = {'index_paths': value}
@ -151,7 +163,9 @@ def create_wb_router(passed_config = {}):
route_class = route_config.get('route_class', Route)
routes.append(route_class(name, wb_handler, config = route_config))
routes.append(route_class(name, wb_handler,
config=route_config,
request_class=request_class))
# cdx query handler
if route_config.get('enable_cdx_api', False):

View File

@ -2,20 +2,25 @@ import re
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.framework.wbexceptions import CaptureException
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader
#=================================================================
class ReplayView:
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
redir_to_exact = True, buffer_response = False, reporter = None):
def __init__(self, content_loader, content_rewriter, head_insert_view=None,
redir_to_exact=True, buffer_response=False, reporter=None,
memento=False):
self.content_loader = content_loader
self.content_rewriter = content_rewriter
@ -28,6 +33,11 @@ class ReplayView:
self._reporter = reporter
if memento:
self.response_class = MementoResponse
else:
self.response_class = WbResponse
def __call__(self, wbrequest, cdx_lines, cdx_loader):
last_e = None
@ -42,7 +52,10 @@ class ReplayView:
try:
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
if first:
self._redirect_if_needed(wbrequest, cdx)
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
first = False
(status_headers, stream) = (self.content_loader.
@ -52,7 +65,9 @@ class ReplayView:
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest)
@ -121,12 +136,16 @@ class ReplayView:
# no rewriting needed!
if rewritten_headers.text_type is None:
response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter)
return self.response_class(rewritten_headers.status_headers,
response_iter,
wbrequest=wbrequest,
cdx=cdx)
def make_head_insert(rule):
return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
return (self.head_insert_view.
render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
# do head insert
if self.head_insert_view:
head_insert_func = make_head_insert
@ -145,9 +164,12 @@ class ReplayView:
if wbrequest.wb_url.mod == 'id_':
status_headers.remove_header('content-length')
return self.buffered_response(status_headers, response_gen)
response_gen = self.buffered_response(status_headers, response_gen)
return WbResponse(status_headers, response_gen)
return self.response_class(status_headers,
response_gen,
wbrequest=wbrequest,
cdx=cdx)
# Buffer rewrite iterator and return a response from a string
@ -165,15 +187,27 @@ class ReplayView:
status_headers.headers.append(('Content-Length', content_length_str))
out.close()
return WbResponse(status_headers, value = [content])
return content
def _redirect_if_needed(self, wbrequest, cdx):
if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
raise InternalRedirect(new_url)
if wbrequest.is_proxy:
return None
return None
redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
if not redir_needed:
return None
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
status_headers = StatusAndHeaders('302 Internal Redirect',
[('Location', new_url)])
# don't include cdx to indicate internal redirect
return self.response_class(status_headers,
wbrequest=wbrequest)
def _reject_self_redirect(self, wbrequest, cdx, status_headers):

View File

@ -9,27 +9,23 @@ from wbrequestresponse import WbRequest, WbResponse
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter(object):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
def __init__(self, routes, **kwargs):
self.routes = routes
# optional port setting may be ignored by wsgi container
self.port = port
self.port = kwargs.get('port')
hostpaths = kwargs.get('hostpaths')
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:
self.fallback = None
self.abs_path = abs_path
self.abs_path = kwargs.get('abs_path')
self.home_view = home_view
self.error_view = error_view
self.home_view = kwargs.get('home_view')
self.error_view = kwargs.get('error_view')
def __call__(self, env):
for route in self.routes:
@ -62,6 +58,7 @@ class Route(object):
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
def __init__(self, regex, handler, coll_group=0, config={},
request_class=WbRequest,
lookahead=SLASH_QUERY_LOOKAHEAD):
self.path = regex
@ -70,6 +67,7 @@ class Route(object):
else:
self.regex = re.compile('')
self.handler = handler
self.request_class = request_class
# collection id from regex group (default 0)
self.coll_group = coll_group
self._custom_init(config)
@ -98,7 +96,7 @@ class Route(object):
coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env,
wbrequest = self.request_class(env,
request_uri=request_uri,
wb_url_str=wb_url_str,
rel_prefix=rel_prefix,

92
pywb/framework/memento.py Normal file
View File

@ -0,0 +1,92 @@
from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date
from wbrequestresponse import WbRequest, WbResponse
#=================================================================
class MementoReqMixin(object):
    """
    Request mixin adding Memento TimeGate detection.

    Sets ``is_timegate`` on the request and, when an Accept-Datetime
    header is supplied for a timegate request, pins the wb_url to the
    requested timestamp.
    """

    def _parse_extra(self):
        """
        Determine whether this request is a TimeGate request.

        A request is treated as a timegate only when its wb_url is of
        type LATEST_REPLAY. If the ``HTTP_ACCEPT_DATETIME`` env value is
        present, it is converted to a timestamp and applied to the wb_url.

        :raises BadRequestException: if the Accept-Datetime value
            can not be converted to a timestamp
        """
        self.is_timegate = False

        wb_url = self.wb_url
        if not wb_url or wb_url.type != wb_url.LATEST_REPLAY:
            return

        self.is_timegate = True

        requested = self.env.get('HTTP_ACCEPT_DATETIME')
        if requested:
            try:
                timestamp = http_date_to_timestamp(requested)
            except Exception:
                # any parse failure is reported to the client as a bad request
                raise BadRequestException('Invalid Accept-Datetime: ' +
                                          requested)

            wb_url.set_replay_timestamp(timestamp)
#=================================================================
class MementoRequest(MementoReqMixin, WbRequest):
    """WbRequest combined with the MementoReqMixin timegate handling."""
#=================================================================
class MementoRespMixin(object):
    """
    Response mixin that adds Memento headers to ``self.status_headers``.

    Depending on the request and the capture (cdx) passed in, appends
    ``Vary``, ``Memento-Datetime`` and ``Link`` headers.
    """

    def _init_derived(self, params):
        """
        Add Memento headers based on the extra response params.

        :param params: dict of extra keyword args passed to the response;
            the 'wbrequest' and 'cdx' entries are used if present.
            Nothing is added when there is no wbrequest or it has no wb_url.
        """
        wbrequest = params.get('wbrequest')
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        # a request object that is not memento-aware has no is_timegate
        # attribute -- treat it as a non-timegate request instead of failing
        is_timegate = getattr(wbrequest, 'is_timegate', False)

        headers = self.status_headers.headers

        if is_timegate:
            headers.append(('Vary', 'accept-datetime'))

        # Determine if memento:
        # if no cdx included, definitely not a memento
        if not cdx:
            is_memento = False

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.is_proxy:
            is_memento = True

        # otherwise only for replay
        else:
            is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)

        if is_memento:
            http_date = timestamp_to_http_date(cdx['timestamp'])
            headers.append(('Memento-Datetime', http_date))

        req_url = wbrequest.wb_url.url

        if is_memento and is_timegate:
            # same resource is both the timegate and the memento
            link = self.make_link(req_url, 'original timegate')
        elif is_memento:
            timegate = wbrequest.urlrewriter.get_timestamp_url('')
            link = ', '.join([self.make_link(req_url, 'original'),
                              self.make_link(timegate, 'timegate')])
        else:
            link = self.make_link(req_url, 'original')

        headers.append(('Link', link))

    def make_link(self, url, type):
        """Format a single Link header value for `url` with rel `type`."""
        return '<{0}>; rel="{1}"'.format(url, type)
#=================================================================
class MementoResponse(MementoRespMixin, WbResponse):
    """WbResponse combined with the MementoRespMixin header handling."""

View File

@ -10,23 +10,12 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
# only latest capture is available currently
#=================================================================
class ProxyArchivalRouter(ArchivalRouter):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
(super(ProxyArchivalRouter, self).
__init__(routes,
hostpaths=hostpaths,
port=port,
abs_path=abs_path,
home_view=home_view,
error_view=error_view))
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
#self.error_view = error_view
def __init__(self, routes, **kwargs):
super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
request_class = routes[0].request_class
self.proxy = ProxyRouter(routes[0].handler,
request_class=request_class,
**kwargs)
def __call__(self, env):
response = self.proxy(env)
@ -44,11 +33,12 @@ class ProxyArchivalRouter(ArchivalRouter):
# Only supports latest capture replay at the moment
#=================================================================
class ProxyRouter:
def __init__(self, handler, hostpaths=None, error_view=None):
def __init__(self, handler, **kwargs):
self.handler = handler
self.hostpaths = hostpaths
self.hostpaths = kwargs.get('hostpaths')
self.error_view = error_view
self.error_view = kwargs.get('error_view')
self.request_class = kwargs.get('request_class')
def __call__(self, env):
url = env['REL_REQUEST_URI']
@ -59,10 +49,9 @@ class ProxyRouter:
if not url.startswith('http://'):
return None
wbrequest = WbRequest(env,
wbrequest = self.request_class(env,
request_uri=url,
wb_url_str=url,
#rel_prefix=url,
host_prefix=self.hostpaths[0],
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,

View File

@ -1,22 +1,8 @@
from pywb.utils.wbexception import WbException
class NotFoundException(WbException):
def status(self):
return '404 Not Found'
# Exceptions that affect a specific capture and result in a retry
class CaptureException(WbException):
def status(self):
return '502 Internal Server Error'
class InternalRedirect(WbException):
def __init__(self, location, status='302 Internal Redirect'):
WbException.__init__(self, 'Redirecting -> ' + location)
self.status = status
self.httpHeaders = [('Location', location)]
def status(self):
return self.status

View File

@ -1,9 +1,9 @@
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
#=================================================================
class WbRequest:
class WbRequest(object):
"""
Represents the main pywb request object.
@ -84,6 +84,8 @@ class WbRequest:
# PERF
env['X_PERF'] = {}
self._parse_extra()
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value:
@ -101,18 +103,25 @@ class WbRequest:
varstr = pprint.pformat(varlist)
return varstr
def _parse_extra(self):
pass
#=================================================================
class WbResponse:
class WbResponse(object):
"""
Represents a pywb wsgi response object.
Holds a status_headers object and a response iter, to be
returned to wsgi container.
"""
def __init__(self, status_headers, value=[]):
def __init__(self, status_headers, value=[], **kwargs):
self.status_headers = status_headers
self.body = value
self._init_derived(kwargs)
def _init_derived(self, params):
pass
@staticmethod
def text_stream(stream, status='200 OK', content_type='text/plain'):

View File

@ -1,7 +1,7 @@
from pywb.utils.wbexception import WbException
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config
from wbexceptions import NotFoundException, InternalRedirect
#from wbexceptions import InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders
@ -66,8 +66,8 @@ class WSGIApp(object):
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg)
except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
# except InternalRedirect as ir:
# return ir.response
except WbException as e:
response = handle_exception(env, wb_router, e, False)

View File

@ -1,9 +1,9 @@
from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import NotFoundException
BLOCK = '["block"]'
ALLOW = '["allow"]'

View File

@ -214,6 +214,11 @@ class WbUrl(BaseWbUrl):
return True
def set_replay_timestamp(self, timestamp):
    """
    Turn this url into a replay url for the given timestamp.

    Sets ``type`` to ``REPLAY`` and stores ``timestamp``.
    """
    self.type = self.REPLAY
    self.timestamp = timestamp
# Str Representation
# ====================
def to_str(self, **overrides):

View File

@ -4,7 +4,7 @@
import surt
import urlparse
from wbexception import WbException
from wbexception import BadRequestException
#=================================================================
@ -17,10 +17,8 @@ class UrlCanonicalizer(object):
#=================================================================
class UrlCanonicalizeException(WbException):
def status(self):
return '400 Bad Request'
class UrlCanonicalizeException(BadRequestException):
pass
#=================================================================
def canonicalize(url, surt_ordered=True):

View File

@ -8,6 +8,7 @@ import time
import datetime
import calendar
from itertools import imap
from email.utils import parsedate, formatdate
#=================================================================
# str <-> datetime conversion
@ -38,6 +39,30 @@ def iso_date_to_datetime(string):
return the_datetime
def http_date_to_datetime(string):
    """
    Parse an HTTP date string into a datetime.

    Only the first six fields of the parsed time tuple
    (year through second) are used to build the result.

    :param string: date string, eg. 'Thu, 26 Dec 2013 09:50:10 GMT'
    :raises ValueError: if the string can not be parsed as a date

    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    parsed = parsedate(string)

    # parsedate() reports failure by returning None instead of raising,
    # so fail explicitly rather than with an opaque TypeError on slicing
    if parsed is None:
        raise ValueError('Invalid HTTP date: ' + repr(string))

    return datetime.datetime(*parsed[:6])
def datetime_to_http_date(the_datetime):
    """
    Format a datetime as an HTTP date string with a 'GMT' suffix.

    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    utc_tuple = the_datetime.utctimetuple()
    epoch_secs = calendar.timegm(utc_tuple)
    return formatdate(epoch_secs, localtime=False, usegmt=True)
def datetime_to_timestamp(the_datetime):
"""
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
@ -59,6 +84,17 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
def http_date_to_timestamp(string):
    """
    Convert an HTTP date string to a 14-digit timestamp string.

    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    parsed_datetime = http_date_to_datetime(string)
    return datetime_to_timestamp(parsed_datetime)
# pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
"""
@ -215,6 +251,17 @@ def timestamp_to_sec(string):
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
def timestamp_to_http_date(string):
    """
    Convert a timestamp string to an HTTP date string.

    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    parsed_datetime = timestamp_to_datetime(string)
    return datetime_to_http_date(parsed_datetime)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -10,3 +10,15 @@ class WbException(Exception):
class AccessException(WbException):
    """WbException whose status() is '403 Access Denied'."""

    def status(self):
        """Return the status line reported for this error."""
        return '403 Access Denied'
#=================================================================
class BadRequestException(WbException):
    """WbException whose status() is '400 Bad Request'."""

    def status(self):
        """Return the status line reported for this error."""
        return '400 Bad Request'
#=================================================================
class NotFoundException(WbException):
    """WbException whose status() is '404 Not Found'."""

    def status(self):
        """Return the status line reported for this error."""
        return '404 Not Found'

View File

@ -5,7 +5,7 @@ import yaml
@pytest.fixture
def testconfig():
config = yaml.load(open('test_config.yaml'))
config = yaml.load(open('tests/test_config.yaml'))
assert config
if 'index_paths' not in config:
# !!! assumes this module is in a sub-directory of project root.

View File

@ -80,10 +80,6 @@ absoulte_paths: true
static_routes:
static/test/route: static/
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing
# Enable simple http proxy mode
enable_http_proxy: true
@ -100,5 +96,11 @@ reporter: !!python/object/new:tests.fixture.PrintReporter []
# custom rules for domain specific matching
#domain_specific_rules: rules.yaml
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
perms_policy: !!python/name:pywb.perms.test.test_perms_policy.perms_policy
# not testing memento here
enable_memento: False

View File

@ -4,7 +4,7 @@ from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
class TestWb:
TEST_CONFIG = 'test_config.yaml'
TEST_CONFIG = 'tests/test_config.yaml'
def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())

175
tests/test_memento.py Normal file
View File

@ -0,0 +1,175 @@
import webtest
from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
MEMENTO_DATETIME = 'Memento-Datetime'
ACCEPT_DATETIME = 'Accept-Datetime'
LINK = 'Link'
VARY = 'Vary'
class TestWb:
    """
    Memento tests, run through a webtest client against the app
    created from TEST_CONFIG.
    """
    TEST_CONFIG = 'tests/test_config_memento.yaml'

    def setup(self):
        # fresh app and test client for every test
        self.app = init_app(create_wb_router,
                            load_yaml=True,
                            config_file=self.TEST_CONFIG)

        self.testapp = webtest.TestApp(self.app)

    # Below functionality is for archival (non-proxy) mode
    # It is designed to conform to Memento protocol Pattern 2.1
    # http://www.mementoweb.org/guide/rfc/#Pattern2.1

    def test_timegate_latest(self):
        """
        TimeGate request sent without an Accept-Datetime header
        """
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert resp_headers[VARY] == 'accept-datetime'
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        expected_path = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
        assert expected_path in resp_headers['Location']

    def test_timegate_accept_datetime(self):
        """
        TimeGate request sent with an Accept-Datetime header
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert resp_headers[VARY] == 'accept-datetime'
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        expected_path = '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css'
        assert expected_path in resp_headers['Location']

    def test_non_timegate_intermediate_redir(self):
        """
        An 'intermediate resource' rather than a timegate:
        redirects to the closest timestamp
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        # not a timegate, partial timestamp /2014/ present
        resp = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 302

        # no vary header
        assert VARY not in resp_headers
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        # redirect to latest, not negotiation via Accept-Datetime
        assert '/pywb/20140127171239/' in resp_headers['Location']

    def test_memento_url(self):
        """
        Memento response for a capture with 200 status
        """
        resp = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css')
        resp_headers = resp.headers

        assert resp.status_int == 200

        assert VARY not in resp_headers

        expected_link = ('<http://www.iana.org/_css/2013.1/screen.css>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"')
        assert resp_headers[LINK] == expected_link

        assert resp_headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_302_memento(self):
        """
        Memento response for a capture that is itself a 302
        """
        resp = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example')
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert VARY not in resp_headers

        expected_link = ('<http://www.iana.org/domains/example>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"')
        assert resp_headers[LINK] == expected_link

        assert resp_headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'

    # Below functions test pywb proxy mode behavior
    # They are designed to roughly conform to Memento protocol Pattern 1.3
    # with the exception that the original resource is not available

    def test_proxy_latest_memento(self):
        """
        Proxy mode request with no Accept-Datetime:
        response is both a timegate and a memento
        """
        # simulate proxy mode by setting REQUEST_URI
        request_uri = 'http://www.iana.org/_css/2013.1/screen.css'
        proxy_env = {'REQUEST_URI': request_uri, 'SCRIPT_NAME': ''}

        resp = self.testapp.get('/x-ignore-this-x', extra_environ=proxy_env)
        resp_headers = resp.headers

        assert resp.status_int == 200

        # for timegate
        assert resp_headers[VARY] == 'accept-datetime'

        # for memento
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert resp_headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT'

    def test_proxy_accept_datetime_memento(self):
        """
        Proxy mode request with a specific Accept-Datetime:
        response is both a timegate and a memento
        """
        # simulate proxy mode by setting REQUEST_URI
        request_uri = 'http://www.iana.org/_css/2013.1/screen.css'
        proxy_env = {'REQUEST_URI': request_uri, 'SCRIPT_NAME': ''}
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        resp = self.testapp.get('/x-ignore-this-x',
                                extra_environ=proxy_env,
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 200

        # for timegate
        assert resp_headers[VARY] == 'accept-datetime'

        # for memento
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert resp_headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_error_bad_accept_datetime(self):
        """
        Invalid Accept-Datetime results in a 400 response
        """
        req_headers = {ACCEPT_DATETIME: 'Sun'}
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers,
                                status=400)

        assert resp.status_int == 400

    def test_error_bad_accept_datetime_proxy(self):
        """
        Invalid Accept-Datetime results in a 400 response
        in proxy mode as well
        """
        request_uri = 'http://www.iana.org/_css/2013.1/screen.css'
        proxy_env = {'REQUEST_URI': request_uri, 'SCRIPT_NAME': ''}
        req_headers = {ACCEPT_DATETIME: 'Sun, abc'}

        resp = self.testapp.get('/x-ignore-this-x',
                                extra_environ=proxy_env,
                                headers=req_headers,
                                status=400)

        assert resp.status_int == 400

View File

@ -5,7 +5,7 @@ from pywb.perms.perms_handler import ALLOW, BLOCK
from pywb.framework.wsgi_wrappers import init_app
class TestPermsApp:
TEST_CONFIG = 'test_config.yaml'
TEST_CONFIG = 'tests/test_config.yaml'
def setup(self):
self.app = init_app(create_perms_checker_app,