mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
first pass at memento support #10!
Memento support is enabled by default and can be toggled via the 'enable_memento' config property. Supports the timegate and memento APIs (no timemap yet), following pattern 2.3 for archival mode and pattern 1.3 for proxy mode. Also: simplify the exception hierarchy a bit more and move it down to utils; make WbRequest and WbResponse extensible with mixins (e.g. for memento).
This commit is contained in:
parent
dd9a2c635f
commit
a1ab54c340
@ -101,3 +101,5 @@ enable_cdx_api: true
|
||||
# Permissions checker
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
|
||||
# Memento support, enable
|
||||
enable_memento: true
|
||||
|
@ -17,12 +17,6 @@ class CDXException(WbException):
|
||||
return '400 Bad Request'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CaptureNotFoundException(CDXException):
|
||||
def status(self):
|
||||
return '404 Not Found'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXObject(OrderedDict):
|
||||
"""
|
||||
|
@ -1,9 +1,10 @@
|
||||
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from cdxops import cdx_load
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
||||
from zipnum import ZipNumCluster
|
||||
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||
from cdxobject import CDXObject, CDXException
|
||||
from query import CDXQuery
|
||||
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||
|
||||
@ -41,7 +42,7 @@ class BaseCDXServer(object):
|
||||
""" Check cdx iter semantics
|
||||
If `cdx_iter` is empty (no matches), check if fuzzy matching
|
||||
is allowed, and try it -- otherwise,
|
||||
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||
throw :exc:`~pywb.utils.wbexception.NotFoundException`
|
||||
"""
|
||||
|
||||
cdx_iter = self.peek_iter(cdx_iter)
|
||||
@ -60,7 +61,7 @@ class BaseCDXServer(object):
|
||||
return self.load_cdx_query(fuzzy_query_params)
|
||||
|
||||
msg = 'No Captures found for: ' + query.url
|
||||
raise CaptureNotFoundException(msg)
|
||||
raise NotFoundException(msg)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
return self.load_cdx_query(CDXQuery(**params))
|
||||
@ -99,7 +100,7 @@ class CDXServer(BaseCDXServer):
|
||||
``matchType`` parameter specifies matching method for ``key``
|
||||
(default ``exact``).
|
||||
other parameters are passed down to :func:`cdx_load`.
|
||||
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException`
|
||||
raises :exc:`~pywb.utils.wbexception.NotFoundException`
|
||||
if no captures are found.
|
||||
|
||||
:param query: query parameters
|
||||
|
@ -26,12 +26,12 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
|
||||
# No matching results
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
|
||||
Traceback (most recent call last):
|
||||
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
# No matching -- limit=1
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
|
||||
Traceback (most recent call last):
|
||||
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||
|
@ -2,9 +2,10 @@ import pkgutil
|
||||
import mimetypes
|
||||
import time
|
||||
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.wbexceptions import WbException, NotFoundException
|
||||
from pywb.framework.wbexceptions import WbException
|
||||
from views import TextCapturesView
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
from pywb.framework.proxy import ProxyArchivalRouter
|
||||
from pywb.framework.wbrequestresponse import WbRequest
|
||||
from pywb.framework.memento import MementoRequest
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
@ -37,6 +39,8 @@ DEFAULTS = {
|
||||
'static_routes': {'static/default': 'static/'},
|
||||
|
||||
'domain_specific_rules': 'pywb/rules.yaml',
|
||||
|
||||
'enable_memento': True,
|
||||
}
|
||||
|
||||
#=================================================================
|
||||
@ -86,6 +90,8 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
|
||||
|
||||
redir_to_exact=config.get('redir_to_exact', True),
|
||||
|
||||
memento=config.get('enable_memento', False),
|
||||
|
||||
reporter=config.get('reporter')
|
||||
)
|
||||
|
||||
@ -126,6 +132,12 @@ def create_wb_router(passed_config = {}):
|
||||
# collections based on cdx source
|
||||
collections = config.get('collections')
|
||||
|
||||
if config.get('enable_memento', False):
|
||||
request_class = MementoRequest
|
||||
else:
|
||||
request_class = WbRequest
|
||||
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, str):
|
||||
value = {'index_paths': value}
|
||||
@ -151,7 +163,9 @@ def create_wb_router(passed_config = {}):
|
||||
|
||||
route_class = route_config.get('route_class', Route)
|
||||
|
||||
routes.append(route_class(name, wb_handler, config = route_config))
|
||||
routes.append(route_class(name, wb_handler,
|
||||
config=route_config,
|
||||
request_class=request_class))
|
||||
|
||||
# cdx query handler
|
||||
if route_config.get('enable_cdx_api', False):
|
||||
|
@ -2,20 +2,25 @@ import re
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
|
||||
from pywb.framework.wbexceptions import CaptureException
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
from pywb.utils.loaders import LimitReader
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ReplayView:
|
||||
|
||||
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
|
||||
|
||||
def __init__(self, content_loader, content_rewriter, head_insert_view=None,
|
||||
redir_to_exact = True, buffer_response = False, reporter = None):
|
||||
redir_to_exact=True, buffer_response=False, reporter=None,
|
||||
memento=False):
|
||||
|
||||
self.content_loader = content_loader
|
||||
self.content_rewriter = content_rewriter
|
||||
@ -28,6 +33,11 @@ class ReplayView:
|
||||
|
||||
self._reporter = reporter
|
||||
|
||||
if memento:
|
||||
self.response_class = MementoResponse
|
||||
else:
|
||||
self.response_class = WbResponse
|
||||
|
||||
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_loader):
|
||||
last_e = None
|
||||
@ -42,7 +52,10 @@ class ReplayView:
|
||||
try:
|
||||
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
|
||||
if first:
|
||||
self._redirect_if_needed(wbrequest, cdx)
|
||||
redir_response = self._redirect_if_needed(wbrequest, cdx)
|
||||
if redir_response:
|
||||
return redir_response
|
||||
|
||||
first = False
|
||||
|
||||
(status_headers, stream) = (self.content_loader.
|
||||
@ -52,7 +65,9 @@ class ReplayView:
|
||||
self._reject_self_redirect(wbrequest, cdx, status_headers)
|
||||
|
||||
# check if redir is needed
|
||||
self._redirect_if_needed(wbrequest, cdx)
|
||||
redir_response = self._redirect_if_needed(wbrequest, cdx)
|
||||
if redir_response:
|
||||
return redir_response
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest)
|
||||
@ -121,10 +136,14 @@ class ReplayView:
|
||||
# no rewriting needed!
|
||||
if rewritten_headers.text_type is None:
|
||||
response_iter = self.stream_to_iter(stream)
|
||||
return WbResponse(rewritten_headers.status_headers, response_iter)
|
||||
return self.response_class(rewritten_headers.status_headers,
|
||||
response_iter,
|
||||
wbrequest=wbrequest,
|
||||
cdx=cdx)
|
||||
|
||||
def make_head_insert(rule):
|
||||
return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
|
||||
return (self.head_insert_view.
|
||||
render_to_string(wbrequest=wbrequest,
|
||||
cdx=cdx,
|
||||
rule=rule))
|
||||
# do head insert
|
||||
@ -145,9 +164,12 @@ class ReplayView:
|
||||
if wbrequest.wb_url.mod == 'id_':
|
||||
status_headers.remove_header('content-length')
|
||||
|
||||
return self.buffered_response(status_headers, response_gen)
|
||||
response_gen = self.buffered_response(status_headers, response_gen)
|
||||
|
||||
return WbResponse(status_headers, response_gen)
|
||||
return self.response_class(status_headers,
|
||||
response_gen,
|
||||
wbrequest=wbrequest,
|
||||
cdx=cdx)
|
||||
|
||||
|
||||
# Buffer rewrite iterator and return a response from a string
|
||||
@ -165,16 +187,28 @@ class ReplayView:
|
||||
status_headers.headers.append(('Content-Length', content_length_str))
|
||||
out.close()
|
||||
|
||||
return WbResponse(status_headers, value = [content])
|
||||
|
||||
return content
|
||||
|
||||
def _redirect_if_needed(self, wbrequest, cdx):
|
||||
if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
|
||||
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
|
||||
raise InternalRedirect(new_url)
|
||||
|
||||
if wbrequest.is_proxy:
|
||||
return None
|
||||
|
||||
redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
|
||||
|
||||
if not redir_needed and self.redir_to_exact:
|
||||
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
|
||||
|
||||
if not redir_needed:
|
||||
return None
|
||||
|
||||
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
|
||||
status_headers = StatusAndHeaders('302 Internal Redirect',
|
||||
[('Location', new_url)])
|
||||
|
||||
# don't include cdx to indicate internal redirect
|
||||
return self.response_class(status_headers,
|
||||
wbrequest=wbrequest)
|
||||
|
||||
|
||||
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
||||
"""
|
||||
|
@ -9,27 +9,23 @@ from wbrequestresponse import WbRequest, WbResponse
|
||||
# ArchivalRouter -- route WB requests in archival mode
|
||||
#=================================================================
|
||||
class ArchivalRouter(object):
|
||||
def __init__(self, routes,
|
||||
hostpaths=None,
|
||||
port=None,
|
||||
abs_path=True,
|
||||
home_view=None,
|
||||
error_view=None):
|
||||
|
||||
def __init__(self, routes, **kwargs):
|
||||
self.routes = routes
|
||||
|
||||
# optional port setting may be ignored by wsgi container
|
||||
self.port = port
|
||||
self.port = kwargs.get('port')
|
||||
|
||||
hostpaths = kwargs.get('hostpaths')
|
||||
|
||||
if hostpaths:
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
else:
|
||||
self.fallback = None
|
||||
|
||||
self.abs_path = abs_path
|
||||
self.abs_path = kwargs.get('abs_path')
|
||||
|
||||
self.home_view = home_view
|
||||
self.error_view = error_view
|
||||
self.home_view = kwargs.get('home_view')
|
||||
self.error_view = kwargs.get('error_view')
|
||||
|
||||
def __call__(self, env):
|
||||
for route in self.routes:
|
||||
@ -62,6 +58,7 @@ class Route(object):
|
||||
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
|
||||
|
||||
def __init__(self, regex, handler, coll_group=0, config={},
|
||||
request_class=WbRequest,
|
||||
lookahead=SLASH_QUERY_LOOKAHEAD):
|
||||
|
||||
self.path = regex
|
||||
@ -70,6 +67,7 @@ class Route(object):
|
||||
else:
|
||||
self.regex = re.compile('')
|
||||
self.handler = handler
|
||||
self.request_class = request_class
|
||||
# collection id from regex group (default 0)
|
||||
self.coll_group = coll_group
|
||||
self._custom_init(config)
|
||||
@ -98,7 +96,7 @@ class Route(object):
|
||||
|
||||
coll = matcher.group(self.coll_group)
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
wbrequest = self.request_class(env,
|
||||
request_uri=request_uri,
|
||||
wb_url_str=wb_url_str,
|
||||
rel_prefix=rel_prefix,
|
||||
|
92
pywb/framework/memento.py
Normal file
92
pywb/framework/memento.py
Normal file
@ -0,0 +1,92 @@
|
||||
from pywb.utils.wbexception import BadRequestException
|
||||
from pywb.utils.timeutils import http_date_to_timestamp
|
||||
from pywb.utils.timeutils import timestamp_to_http_date
|
||||
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoReqMixin(object):
    """
    Request mixin implementing the ``_parse_extra()`` hook for Memento.

    Sets ``self.is_timegate`` and, when an ``Accept-Datetime`` request
    header is present on a timegate request, applies the requested
    datetime to ``self.wb_url``.
    """

    def _parse_extra(self):
        """
        Flag the request as a timegate and apply ``Accept-Datetime``.

        The request is a timegate only when ``self.wb_url`` is set and its
        type is ``LATEST_REPLAY``. For a timegate, the value of the
        ``HTTP_ACCEPT_DATETIME`` environ key (if any) is converted with
        ``http_date_to_timestamp()`` and passed to
        ``wb_url.set_replay_timestamp()``.

        Raises ``BadRequestException`` if the header value cannot be
        converted.
        """
        self.is_timegate = False

        wb_url = self.wb_url

        if wb_url and wb_url.type == wb_url.LATEST_REPLAY:
            self.is_timegate = True

            header_value = self.env.get('HTTP_ACCEPT_DATETIME')

            if header_value:
                try:
                    replay_ts = http_date_to_timestamp(header_value)
                except Exception:
                    # any conversion failure is reported as a bad request
                    raise BadRequestException('Invalid Accept-Datetime: ' +
                                              header_value)

                wb_url.set_replay_timestamp(replay_ts)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoRequest(MementoReqMixin, WbRequest):
    """
    ``WbRequest`` combined with ``MementoReqMixin``, so that the
    ``_parse_extra()`` hook performs the Memento request handling.
    """
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoRespMixin(object):
    """
    Response mixin that appends Memento headers to ``self.status_headers``.

    Implements the ``_init_derived()`` hook, which receives the extra
    keyword arguments given to the response constructor and uses the
    optional ``wbrequest`` and ``cdx`` entries.
    """

    def _init_derived(self, params):
        """
        Add ``Vary``, ``Memento-Datetime`` and ``Link`` headers as needed.

        :param params: dict of extra constructor keyword arguments;
            ``wbrequest`` and ``cdx`` are read if present.

        Nothing is added when ``wbrequest`` is missing or has no ``wb_url``.
        """
        wbrequest = params.get('wbrequest')
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        # A request object created without the memento request mixin has
        # no 'is_timegate' attribute: treat it as a non-timegate request
        # rather than failing with AttributeError.
        is_timegate = getattr(wbrequest, 'is_timegate', False)

        if is_timegate:
            self.status_headers.headers.append(('Vary', 'accept-datetime'))

        # Determine if memento:
        # if no cdx included, definitely not a memento
        if not cdx:
            is_memento = False

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.is_proxy:
            is_memento = True

        # otherwise only for replay
        else:
            is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)

        if is_memento:
            http_date = timestamp_to_http_date(cdx['timestamp'])
            self.status_headers.headers.append(('Memento-Datetime', http_date))

        req_url = wbrequest.wb_url.url

        if is_memento and is_timegate:
            # same url serves as both original and timegate
            link = self.make_link(req_url, 'original timegate')
        elif is_memento:
            # timegate is the url with an empty timestamp
            timegate = wbrequest.urlrewriter.get_timestamp_url('')

            link = []
            link.append(self.make_link(req_url, 'original'))
            link.append(self.make_link(timegate, 'timegate'))
            link = ', '.join(link)
        else:
            link = self.make_link(req_url, 'original')

        self.status_headers.headers.append(('Link', link))

    def make_link(self, url, type):
        """
        Format a single ``Link`` header entry: ``<url>; rel="type"``.
        """
        return '<{0}>; rel="{1}"'.format(url, type)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoResponse(MementoRespMixin, WbResponse):
    """
    ``WbResponse`` combined with ``MementoRespMixin``, so that the
    ``_init_derived()`` hook adds the Memento response headers.
    """
|
@ -10,23 +10,12 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
||||
# only latest capture is available currently
|
||||
#=================================================================
|
||||
class ProxyArchivalRouter(ArchivalRouter):
|
||||
def __init__(self, routes,
|
||||
hostpaths=None,
|
||||
port=None,
|
||||
abs_path=True,
|
||||
home_view=None,
|
||||
error_view=None):
|
||||
|
||||
(super(ProxyArchivalRouter, self).
|
||||
__init__(routes,
|
||||
hostpaths=hostpaths,
|
||||
port=port,
|
||||
abs_path=abs_path,
|
||||
home_view=home_view,
|
||||
error_view=error_view))
|
||||
|
||||
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
|
||||
#self.error_view = error_view
|
||||
def __init__(self, routes, **kwargs):
|
||||
super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
|
||||
request_class = routes[0].request_class
|
||||
self.proxy = ProxyRouter(routes[0].handler,
|
||||
request_class=request_class,
|
||||
**kwargs)
|
||||
|
||||
def __call__(self, env):
|
||||
response = self.proxy(env)
|
||||
@ -44,11 +33,12 @@ class ProxyArchivalRouter(ArchivalRouter):
|
||||
# Only supports latest capture replay at the moment
|
||||
#=================================================================
|
||||
class ProxyRouter:
|
||||
def __init__(self, handler, hostpaths=None, error_view=None):
|
||||
def __init__(self, handler, **kwargs):
|
||||
self.handler = handler
|
||||
self.hostpaths = hostpaths
|
||||
self.hostpaths = kwargs.get('hostpaths')
|
||||
|
||||
self.error_view = error_view
|
||||
self.error_view = kwargs.get('error_view')
|
||||
self.request_class = kwargs.get('request_class')
|
||||
|
||||
def __call__(self, env):
|
||||
url = env['REL_REQUEST_URI']
|
||||
@ -59,10 +49,9 @@ class ProxyRouter:
|
||||
if not url.startswith('http://'):
|
||||
return None
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
wbrequest = self.request_class(env,
|
||||
request_uri=url,
|
||||
wb_url_str=url,
|
||||
#rel_prefix=url,
|
||||
host_prefix=self.hostpaths[0],
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
urlrewriter_class=HttpsUrlRewriter,
|
||||
|
@ -1,22 +1,8 @@
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
|
||||
class NotFoundException(WbException):
|
||||
def status(self):
|
||||
return '404 Not Found'
|
||||
|
||||
|
||||
# Exceptions that affect a specific capture and result in a retry
|
||||
class CaptureException(WbException):
|
||||
def status(self):
|
||||
return '502 Internal Server Error'
|
||||
|
||||
|
||||
class InternalRedirect(WbException):
|
||||
def __init__(self, location, status='302 Internal Redirect'):
|
||||
WbException.__init__(self, 'Redirecting -> ' + location)
|
||||
self.status = status
|
||||
self.httpHeaders = [('Location', location)]
|
||||
|
||||
def status(self):
|
||||
return self.status
|
||||
|
@ -1,9 +1,9 @@
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbRequest:
|
||||
class WbRequest(object):
|
||||
"""
|
||||
Represents the main pywb request object.
|
||||
|
||||
@ -84,6 +84,8 @@ class WbRequest:
|
||||
# PERF
|
||||
env['X_PERF'] = {}
|
||||
|
||||
self._parse_extra()
|
||||
|
||||
def _is_ajax(self):
|
||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||
if not value:
|
||||
@ -101,18 +103,25 @@ class WbRequest:
|
||||
varstr = pprint.pformat(varlist)
|
||||
return varstr
|
||||
|
||||
def _parse_extra(self):
|
||||
pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbResponse:
|
||||
class WbResponse(object):
|
||||
"""
|
||||
Represents a pywb wsgi response object.
|
||||
|
||||
Holds a status_headers object and a response iter, to be
|
||||
returned to wsgi container.
|
||||
"""
|
||||
def __init__(self, status_headers, value=[]):
|
||||
def __init__(self, status_headers, value=[], **kwargs):
|
||||
self.status_headers = status_headers
|
||||
self.body = value
|
||||
self._init_derived(kwargs)
|
||||
|
||||
def _init_derived(self, params):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def text_stream(stream, status='200 OK', content_type='text/plain'):
|
||||
|
@ -1,7 +1,7 @@
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.wbexception import WbException, NotFoundException
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
|
||||
from wbexceptions import NotFoundException, InternalRedirect
|
||||
#from wbexceptions import InternalRedirect
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
|
||||
|
||||
@ -66,8 +66,8 @@ class WSGIApp(object):
|
||||
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
|
||||
raise NotFoundException(msg)
|
||||
|
||||
except InternalRedirect as ir:
|
||||
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
||||
# except InternalRedirect as ir:
|
||||
# return ir.response
|
||||
|
||||
except WbException as e:
|
||||
response = handle_exception(env, wb_router, e, False)
|
||||
|
@ -1,9 +1,9 @@
|
||||
from pywb.utils.canonicalize import UrlCanonicalizer
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from pywb.framework.basehandlers import WbUrlHandler
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.wbexceptions import NotFoundException
|
||||
|
||||
BLOCK = '["block"]'
|
||||
ALLOW = '["allow"]'
|
||||
|
@ -214,6 +214,11 @@ class WbUrl(BaseWbUrl):
|
||||
|
||||
return True
|
||||
|
||||
def set_replay_timestamp(self, timestamp):
    """
    Store ``timestamp`` on this url and switch its type to ``REPLAY``.
    """
    self.timestamp = timestamp

    # the url now refers to a replay at that timestamp
    self.type = self.REPLAY
|
||||
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
def to_str(self, **overrides):
|
||||
|
@ -4,7 +4,7 @@
|
||||
import surt
|
||||
import urlparse
|
||||
|
||||
from wbexception import WbException
|
||||
from wbexception import BadRequestException
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -17,10 +17,8 @@ class UrlCanonicalizer(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UrlCanonicalizeException(WbException):
|
||||
def status(self):
|
||||
return '400 Bad Request'
|
||||
|
||||
class UrlCanonicalizeException(BadRequestException):
|
||||
pass
|
||||
|
||||
#=================================================================
|
||||
def canonicalize(url, surt_ordered=True):
|
||||
|
@ -8,6 +8,7 @@ import time
|
||||
import datetime
|
||||
import calendar
|
||||
from itertools import imap
|
||||
from email.utils import parsedate, formatdate
|
||||
|
||||
#=================================================================
|
||||
# str <-> datetime conversion
|
||||
@ -38,6 +39,30 @@ def iso_date_to_datetime(string):
|
||||
return the_datetime
|
||||
|
||||
|
||||
def http_date_to_datetime(string):
    """
    Parse an HTTP date string into a datetime.

    Raises ValueError if the string cannot be parsed as a date.

    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)

    >>> http_date_to_datetime('Sun')
    Traceback (most recent call last):
    ValueError: Invalid HTTP date: 'Sun'
    """
    parsed = parsedate(string)

    # parsedate() signals failure by returning None; report that explicitly
    # instead of failing with a TypeError on the slice below
    if parsed is None:
        raise ValueError('Invalid HTTP date: ' + repr(string))

    # first six fields: year, month, day, hour, minute, second
    return datetime.datetime(*parsed[:6])
|
||||
|
||||
|
||||
def datetime_to_http_date(the_datetime):
    """
    Format a datetime as an HTTP date string with a ``GMT`` suffix.

    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    # seconds since the epoch, computed from the UTC time tuple
    epoch_secs = calendar.timegm(the_datetime.utctimetuple())

    # positional args: timeval, localtime=False, usegmt=True
    return formatdate(epoch_secs, False, True)
|
||||
|
||||
|
||||
def datetime_to_timestamp(the_datetime):
|
||||
"""
|
||||
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
|
||||
@ -59,6 +84,17 @@ def iso_date_to_timestamp(string):
|
||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||
|
||||
|
||||
def http_date_to_timestamp(string):
    """
    Convert an HTTP date string to a timestamp string, by way of
    http_date_to_datetime() and datetime_to_timestamp().

    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    the_datetime = http_date_to_datetime(string)
    return datetime_to_timestamp(the_datetime)
|
||||
|
||||
|
||||
# pad to certain length (default 6)
|
||||
def _pad_timestamp(string, pad_str=PAD_6):
|
||||
"""
|
||||
@ -215,6 +251,17 @@ def timestamp_to_sec(string):
|
||||
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
|
||||
|
||||
|
||||
def timestamp_to_http_date(string):
    """
    Convert a timestamp string to an HTTP date string, by way of
    timestamp_to_datetime() and datetime_to_http_date().

    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    the_datetime = timestamp_to_datetime(string)
    return datetime_to_http_date(the_datetime)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -10,3 +10,15 @@ class WbException(Exception):
|
||||
class AccessException(WbException):
|
||||
def status(self):
|
||||
return '403 Access Denied'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BadRequestException(WbException):
    """
    Error reported with a ``400 Bad Request`` status.
    """

    def status(self):
        """Return the status line for this error."""
        return '400 Bad Request'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class NotFoundException(WbException):
    """
    Error reported with a ``404 Not Found`` status.
    """

    def status(self):
        """Return the status line for this error."""
        return '404 Not Found'
|
||||
|
@ -5,7 +5,7 @@ import yaml
|
||||
|
||||
@pytest.fixture
|
||||
def testconfig():
|
||||
config = yaml.load(open('test_config.yaml'))
|
||||
config = yaml.load(open('tests/test_config.yaml'))
|
||||
assert config
|
||||
if 'index_paths' not in config:
|
||||
# !!! assumes this module is in a sub-directory of project root.
|
||||
|
@ -80,10 +80,6 @@ absoulte_paths: true
|
||||
static_routes:
|
||||
static/test/route: static/
|
||||
|
||||
|
||||
# ==== New / Experimental Settings ====
|
||||
# Not yet production ready -- used primarily for testing
|
||||
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
||||
@ -100,5 +96,11 @@ reporter: !!python/object/new:tests.fixture.PrintReporter []
|
||||
# custom rules for domain specific matching
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
||||
# ==== New / Experimental Settings ====
|
||||
# Not yet production ready -- used primarily for testing
|
||||
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
perms_policy: !!python/name:pywb.perms.test.test_perms_policy.perms_policy
|
||||
|
||||
# not testing memento here
|
||||
enable_memento: False
|
@ -4,7 +4,7 @@ from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'test_config.yaml'
|
||||
TEST_CONFIG = 'tests/test_config.yaml'
|
||||
|
||||
def setup(self):
|
||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||
|
175
tests/test_memento.py
Normal file
175
tests/test_memento.py
Normal file
@ -0,0 +1,175 @@
|
||||
import webtest
|
||||
from pywb.core.pywb_init import create_wb_router
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
MEMENTO_DATETIME = 'Memento-Datetime'
|
||||
ACCEPT_DATETIME = 'Accept-Datetime'
|
||||
LINK = 'Link'
|
||||
VARY = 'Vary'
|
||||
|
||||
class TestWb:
    """
    Memento header tests, run against the app built from ``TEST_CONFIG``.
    """
    TEST_CONFIG = 'tests/test_config_memento.yaml'

    def setup(self):
        """Build the wsgi app from the test config and wrap it for testing."""
        app = init_app(create_wb_router,
                       load_yaml=True,
                       config_file=self.TEST_CONFIG)

        self.app = app
        self.testapp = webtest.TestApp(app)

    # Below functionality is for archival (non-proxy) mode
    # It is designed to conform to Memento protocol Pattern 2.1
    # http://www.mementoweb.org/guide/rfc/#Pattern2.1

    def test_timegate_latest(self):
        """
        TimeGate with no Accept-Datetime header
        """
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert resp_headers[VARY] == 'accept-datetime'
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp_headers['Location']

    def test_timegate_accept_datetime(self):
        """
        TimeGate with Accept-Datetime header
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert resp_headers[VARY] == 'accept-datetime'
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp_headers['Location']

    def test_non_timegate_intermediate_redir(self):
        """
        Not a timegate, but an 'intermediate resource', redirect to closest timestamp
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        # not a timegate, partial timestamp /2014/ present
        resp = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 302

        # no vary header
        assert VARY not in resp_headers
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in resp_headers

        # redirect to latest, not negotiation via Accept-Datetime
        assert '/pywb/20140127171239/' in resp_headers['Location']

    def test_memento_url(self):
        """
        Memento response, 200 capture
        """
        resp = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css')
        resp_headers = resp.headers

        assert resp.status_int == 200

        assert VARY not in resp_headers

        expected_link = ('<http://www.iana.org/_css/2013.1/screen.css>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"')
        assert resp_headers[LINK] == expected_link

        assert resp_headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_302_memento(self):
        """
        Memento (capture) of a 302 response
        """
        resp = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example')
        resp_headers = resp.headers

        assert resp.status_int == 302

        assert VARY not in resp_headers

        expected_link = ('<http://www.iana.org/domains/example>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"')
        assert resp_headers[LINK] == expected_link

        assert resp_headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'

    # Below functions test pywb proxy mode behavior
    # They are designed to roughly conform to Memento protocol Pattern 1.3
    # with the exception that the original resource is not available

    def test_proxy_latest_memento(self):
        """
        Proxy Mode memento with no Accept-Datetime
        Both a timegate and a memento
        """
        # simulate proxy mode by setting REQUEST_URI
        environ = {'REQUEST_URI': 'http://www.iana.org/_css/2013.1/screen.css',
                   'SCRIPT_NAME': ''}

        resp = self.testapp.get('/x-ignore-this-x', extra_environ=environ)
        resp_headers = resp.headers

        assert resp.status_int == 200

        # for timegate
        assert resp_headers[VARY] == 'accept-datetime'

        # for memento
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert resp_headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT'

    def test_proxy_accept_datetime_memento(self):
        """
        Proxy Mode memento with specific Accept-Datetime
        Both a timegate and a memento
        """
        # simulate proxy mode by setting REQUEST_URI
        environ = {'REQUEST_URI': 'http://www.iana.org/_css/2013.1/screen.css',
                   'SCRIPT_NAME': ''}
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        resp = self.testapp.get('/x-ignore-this-x',
                                extra_environ=environ,
                                headers=req_headers)
        resp_headers = resp.headers

        assert resp.status_int == 200

        # for timegate
        assert resp_headers[VARY] == 'accept-datetime'

        # for memento
        assert resp_headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert resp_headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_error_bad_accept_datetime(self):
        """
        400 response for bad accept_datetime
        """
        req_headers = {ACCEPT_DATETIME: 'Sun'}
        resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                headers=req_headers,
                                status=400)
        assert resp.status_int == 400

    def test_error_bad_accept_datetime_proxy(self):
        """
        400 response for bad accept_datetime
        with proxy mode
        """
        environ = {'REQUEST_URI': 'http://www.iana.org/_css/2013.1/screen.css',
                   'SCRIPT_NAME': ''}
        req_headers = {ACCEPT_DATETIME: 'Sun, abc'}

        resp = self.testapp.get('/x-ignore-this-x',
                                extra_environ=environ,
                                headers=req_headers,
                                status=400)

        assert resp.status_int == 400
|
@ -5,7 +5,7 @@ from pywb.perms.perms_handler import ALLOW, BLOCK
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
|
||||
class TestPermsApp:
|
||||
TEST_CONFIG = 'test_config.yaml'
|
||||
TEST_CONFIG = 'tests/test_config.yaml'
|
||||
|
||||
def setup(self):
|
||||
self.app = init_app(create_perms_checker_app,
|
||||
|
Loading…
x
Reference in New Issue
Block a user