1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

first pass at memento support #10!

memento support enabled by default, togglable via 'enable_memento' config property
supporting timegate and memento apis, no timemap yet
supporting pattern 2.3 for archival and pattern 1.3 for proxy modes
also:
simplify exception hierarchy a bit more, move down to utils
make WbRequest and WbResponse extensible with mixins (eg for memento)
This commit is contained in:
Ilya Kreymer 2014-03-14 10:46:20 -07:00
parent dd9a2c635f
commit a1ab54c340
23 changed files with 460 additions and 101 deletions

View File

@ -101,3 +101,5 @@ enable_cdx_api: true
# Permissions checker # Permissions checker
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
# Memento support, enable
enable_memento: true

View File

@ -17,12 +17,6 @@ class CDXException(WbException):
return '400 Bad Request' return '400 Bad Request'
#=================================================================
class CaptureNotFoundException(CDXException):
def status(self):
return '404 Not Found'
#================================================================= #=================================================================
class CDXObject(OrderedDict): class CDXObject(OrderedDict):
""" """

View File

@ -1,9 +1,10 @@
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from pywb.utils.wbexception import NotFoundException
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CDXException
from query import CDXQuery from query import CDXQuery
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
@ -41,7 +42,7 @@ class BaseCDXServer(object):
""" Check cdx iter semantics """ Check cdx iter semantics
If `cdx_iter` is empty (no matches), check if fuzzy matching If `cdx_iter` is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise, is allowed, and try it -- otherwise,
throw :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` throw :exc:`~pywb.utils.wbexception.NotFoundException`
""" """
cdx_iter = self.peek_iter(cdx_iter) cdx_iter = self.peek_iter(cdx_iter)
@ -60,7 +61,7 @@ class BaseCDXServer(object):
return self.load_cdx_query(fuzzy_query_params) return self.load_cdx_query(fuzzy_query_params)
msg = 'No Captures found for: ' + query.url msg = 'No Captures found for: ' + query.url
raise CaptureNotFoundException(msg) raise NotFoundException(msg)
def load_cdx(self, **params): def load_cdx(self, **params):
return self.load_cdx_query(CDXQuery(**params)) return self.load_cdx_query(CDXQuery(**params))
@ -99,7 +100,7 @@ class CDXServer(BaseCDXServer):
``matchType`` parameter specifies matching method for ``key`` ``matchType`` parameter specifies matching method for ``key``
(default ``exact``). (default ``exact``).
other parameters are passed down to :func:`cdx_load`. other parameters are passed down to :func:`cdx_load`.
raises :exc:`~pywb.cdx.cdxobject.CaptureNotFoundException` raises :exc:`~pywb.utils.wbexception.NotFoundException`
if no captures are found. if no captures are found.
:param query: query parameters :param query: query parameters

View File

@ -26,12 +26,12 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
# No matching results # No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
Traceback (most recent call last): Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1 # No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
Traceback (most recent call last): Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this NotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex) # Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) >>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])

View File

@ -2,9 +2,10 @@ import pkgutil
import mimetypes import mimetypes
import time import time
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import WbException, NotFoundException from pywb.framework.wbexceptions import WbException
from views import TextCapturesView from views import TextCapturesView

View File

@ -1,5 +1,7 @@
from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter from pywb.framework.proxy import ProxyArchivalRouter
from pywb.framework.wbrequestresponse import WbRequest
from pywb.framework.memento import MementoRequest
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
@ -37,6 +39,8 @@ DEFAULTS = {
'static_routes': {'static/default': 'static/'}, 'static_routes': {'static/default': 'static/'},
'domain_specific_rules': 'pywb/rules.yaml', 'domain_specific_rules': 'pywb/rules.yaml',
'enable_memento': True,
} }
#================================================================= #=================================================================
@ -86,6 +90,8 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
redir_to_exact=config.get('redir_to_exact', True), redir_to_exact=config.get('redir_to_exact', True),
memento=config.get('enable_memento', False),
reporter=config.get('reporter') reporter=config.get('reporter')
) )
@ -126,6 +132,12 @@ def create_wb_router(passed_config = {}):
# collections based on cdx source # collections based on cdx source
collections = config.get('collections') collections = config.get('collections')
if config.get('enable_memento', False):
request_class = MementoRequest
else:
request_class = WbRequest
for name, value in collections.iteritems(): for name, value in collections.iteritems():
if isinstance(value, str): if isinstance(value, str):
value = {'index_paths': value} value = {'index_paths': value}
@ -151,7 +163,9 @@ def create_wb_router(passed_config = {}):
route_class = route_config.get('route_class', Route) route_class = route_config.get('route_class', Route)
routes.append(route_class(name, wb_handler, config = route_config)) routes.append(route_class(name, wb_handler,
config=route_config,
request_class=request_class))
# cdx query handler # cdx query handler
if route_config.get('enable_cdx_api', False): if route_config.get('enable_cdx_api', False):

View File

@ -2,20 +2,25 @@ import re
from io import BytesIO from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.framework.wbrequestresponse import WbResponse from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.wbexceptions import CaptureException, InternalRedirect from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.framework.wbexceptions import CaptureException
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader from pywb.utils.loaders import LimitReader
#================================================================= #=================================================================
class ReplayView: class ReplayView:
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
def __init__(self, content_loader, content_rewriter, head_insert_view = None, def __init__(self, content_loader, content_rewriter, head_insert_view=None,
redir_to_exact = True, buffer_response = False, reporter = None): redir_to_exact=True, buffer_response=False, reporter=None,
memento=False):
self.content_loader = content_loader self.content_loader = content_loader
self.content_rewriter = content_rewriter self.content_rewriter = content_rewriter
@ -28,6 +33,11 @@ class ReplayView:
self._reporter = reporter self._reporter = reporter
if memento:
self.response_class = MementoResponse
else:
self.response_class = WbResponse
def __call__(self, wbrequest, cdx_lines, cdx_loader): def __call__(self, wbrequest, cdx_lines, cdx_loader):
last_e = None last_e = None
@ -42,7 +52,10 @@ class ReplayView:
try: try:
# optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data # optimize: can detect if redirect is needed just from the cdx, no need to load w/arc data
if first: if first:
self._redirect_if_needed(wbrequest, cdx) redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
first = False first = False
(status_headers, stream) = (self.content_loader. (status_headers, stream) = (self.content_loader.
@ -52,7 +65,9 @@ class ReplayView:
self._reject_self_redirect(wbrequest, cdx, status_headers) self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed # check if redir is needed
self._redirect_if_needed(wbrequest, cdx) redir_response = self._redirect_if_needed(wbrequest, cdx)
if redir_response:
return redir_response
# one more check for referrer-based self-redirect # one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest) self._reject_referrer_self_redirect(wbrequest)
@ -121,12 +136,16 @@ class ReplayView:
# no rewriting needed! # no rewriting needed!
if rewritten_headers.text_type is None: if rewritten_headers.text_type is None:
response_iter = self.stream_to_iter(stream) response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter) return self.response_class(rewritten_headers.status_headers,
response_iter,
wbrequest=wbrequest,
cdx=cdx)
def make_head_insert(rule): def make_head_insert(rule):
return (self.head_insert_view.render_to_string(wbrequest=wbrequest, return (self.head_insert_view.
cdx=cdx, render_to_string(wbrequest=wbrequest,
rule=rule)) cdx=cdx,
rule=rule))
# do head insert # do head insert
if self.head_insert_view: if self.head_insert_view:
head_insert_func = make_head_insert head_insert_func = make_head_insert
@ -145,9 +164,12 @@ class ReplayView:
if wbrequest.wb_url.mod == 'id_': if wbrequest.wb_url.mod == 'id_':
status_headers.remove_header('content-length') status_headers.remove_header('content-length')
return self.buffered_response(status_headers, response_gen) response_gen = self.buffered_response(status_headers, response_gen)
return WbResponse(status_headers, response_gen) return self.response_class(status_headers,
response_gen,
wbrequest=wbrequest,
cdx=cdx)
# Buffer rewrite iterator and return a response from a string # Buffer rewrite iterator and return a response from a string
@ -165,15 +187,27 @@ class ReplayView:
status_headers.headers.append(('Content-Length', content_length_str)) status_headers.headers.append(('Content-Length', content_length_str))
out.close() out.close()
return WbResponse(status_headers, value = [content]) return content
def _redirect_if_needed(self, wbrequest, cdx): def _redirect_if_needed(self, wbrequest, cdx):
if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): if wbrequest.is_proxy:
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original']) return None
raise InternalRedirect(new_url)
return None redir_needed = hasattr(wbrequest, 'is_timegate') and wbrequest.is_timegate
if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
if not redir_needed:
return None
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
status_headers = StatusAndHeaders('302 Internal Redirect',
[('Location', new_url)])
# don't include cdx to indicate internal redirect
return self.response_class(status_headers,
wbrequest=wbrequest)
def _reject_self_redirect(self, wbrequest, cdx, status_headers): def _reject_self_redirect(self, wbrequest, cdx, status_headers):

View File

@ -9,27 +9,23 @@ from wbrequestresponse import WbRequest, WbResponse
# ArchivalRouter -- route WB requests in archival mode # ArchivalRouter -- route WB requests in archival mode
#================================================================= #=================================================================
class ArchivalRouter(object): class ArchivalRouter(object):
def __init__(self, routes, def __init__(self, routes, **kwargs):
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
self.routes = routes self.routes = routes
# optional port setting may be ignored by wsgi container # optional port setting may be ignored by wsgi container
self.port = port self.port = kwargs.get('port')
hostpaths = kwargs.get('hostpaths')
if hostpaths: if hostpaths:
self.fallback = ReferRedirect(hostpaths) self.fallback = ReferRedirect(hostpaths)
else: else:
self.fallback = None self.fallback = None
self.abs_path = abs_path self.abs_path = kwargs.get('abs_path')
self.home_view = home_view self.home_view = kwargs.get('home_view')
self.error_view = error_view self.error_view = kwargs.get('error_view')
def __call__(self, env): def __call__(self, env):
for route in self.routes: for route in self.routes:
@ -62,6 +58,7 @@ class Route(object):
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)' SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
def __init__(self, regex, handler, coll_group=0, config={}, def __init__(self, regex, handler, coll_group=0, config={},
request_class=WbRequest,
lookahead=SLASH_QUERY_LOOKAHEAD): lookahead=SLASH_QUERY_LOOKAHEAD):
self.path = regex self.path = regex
@ -70,6 +67,7 @@ class Route(object):
else: else:
self.regex = re.compile('') self.regex = re.compile('')
self.handler = handler self.handler = handler
self.request_class = request_class
# collection id from regex group (default 0) # collection id from regex group (default 0)
self.coll_group = coll_group self.coll_group = coll_group
self._custom_init(config) self._custom_init(config)
@ -98,7 +96,7 @@ class Route(object):
coll = matcher.group(self.coll_group) coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env, wbrequest = self.request_class(env,
request_uri=request_uri, request_uri=request_uri,
wb_url_str=wb_url_str, wb_url_str=wb_url_str,
rel_prefix=rel_prefix, rel_prefix=rel_prefix,

92
pywb/framework/memento.py Normal file
View File

@ -0,0 +1,92 @@
from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date
from wbrequestresponse import WbRequest, WbResponse
#=================================================================
class MementoReqMixin(object):
    """
    Request mixin that adds Memento TimeGate handling.

    Provides the ``_parse_extra()`` hook. The hook reads ``self.wb_url``
    and ``self.env``, so both must be set before it is invoked.
    """

    def _parse_extra(self):
        """
        Set ``self.is_timegate`` and apply any Accept-Datetime header.

        ``is_timegate`` is True only when a ``wb_url`` is present and its
        ``type`` equals ``wb_url.LATEST_REPLAY``. For a timegate request
        carrying an ``Accept-Datetime`` header, the header is converted to
        a timestamp and applied via ``wb_url.set_replay_timestamp()``.

        Raises :exc:`BadRequestException` if the header cannot be parsed.
        """
        wb_url = self.wb_url

        # timegate == a url is present and it is a 'latest replay' url
        is_latest = bool(wb_url) and not (wb_url.type != wb_url.LATEST_REPLAY)
        self.is_timegate = is_latest
        if not is_latest:
            return

        header_value = self.env.get('HTTP_ACCEPT_DATETIME')
        if header_value:
            try:
                requested = http_date_to_timestamp(header_value)
            except Exception:
                # any parse failure is reported to the client as a bad request
                raise BadRequestException('Invalid Accept-Datetime: ' +
                                          header_value)

            wb_url.set_replay_timestamp(requested)
#=================================================================
class MementoRequest(MementoReqMixin, WbRequest):
    """
    WbRequest with Memento TimeGate parsing mixed in.

    The mixin is listed first so that its ``_parse_extra()`` takes
    precedence over the one defined on WbRequest.
    """
#=================================================================
class MementoRespMixin(object):
    """
    Response mixin that adds Memento headers to ``self.status_headers``.

    Provides the ``_init_derived()`` hook, which appends ``Vary``,
    ``Memento-Datetime`` and ``Link`` headers based on the request and
    cdx entry passed to the response as keyword arguments.
    """

    def _init_derived(self, params):
        """
        Append Memento headers for this response.

        :param params: dict of extra response arguments; the ``wbrequest``
            and ``cdx`` keys are read, both optional.

        Nothing is added when there is no request or the request has no
        ``wb_url``. Otherwise:

        * ``Vary: accept-datetime`` is added for timegate requests
        * ``Memento-Datetime`` is added when the response is a memento
        * ``Link`` is always added, its relations depending on whether
          the response is a memento and/or a timegate
        """
        wbrequest = params.get('wbrequest')
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        # A plain (non-memento) request object does not define
        # is_timegate; treat it as a non-timegate request rather than
        # failing with an AttributeError.
        is_timegate = getattr(wbrequest, 'is_timegate', False)

        if is_timegate:
            self.status_headers.headers.append(('Vary', 'accept-datetime'))

        # Determine if memento:
        # if no cdx included, definitely not a memento
        if not cdx:
            is_memento = False

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.is_proxy:
            is_memento = True

        # otherwise only for replay
        else:
            is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)

        if is_memento:
            http_date = timestamp_to_http_date(cdx['timestamp'])
            self.status_headers.headers.append(('Memento-Datetime', http_date))

        req_url = wbrequest.wb_url.url

        if is_memento and is_timegate:
            # same resource is both the timegate and the memento
            link = self.make_link(req_url, 'original timegate')
        elif is_memento:
            # timegate url is the replay url with an empty timestamp
            timegate = wbrequest.urlrewriter.get_timestamp_url('')

            link = ', '.join([self.make_link(req_url, 'original'),
                              self.make_link(timegate, 'timegate')])
        else:
            link = self.make_link(req_url, 'original')

        self.status_headers.headers.append(('Link', link))

    def make_link(self, url, type):
        """Return a single Link header value: ``<url>; rel="type"``."""
        return '<{0}>; rel="{1}"'.format(url, type)
#=================================================================
class MementoResponse(MementoRespMixin, WbResponse):
    """
    WbResponse with Memento header generation mixed in.

    The mixin is listed first so that its ``_init_derived()`` takes
    precedence over the one defined on WbResponse.
    """

View File

@ -10,23 +10,12 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
# only latest capture is available currently # only latest capture is available currently
#================================================================= #=================================================================
class ProxyArchivalRouter(ArchivalRouter): class ProxyArchivalRouter(ArchivalRouter):
def __init__(self, routes, def __init__(self, routes, **kwargs):
hostpaths=None, super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
port=None, request_class = routes[0].request_class
abs_path=True, self.proxy = ProxyRouter(routes[0].handler,
home_view=None, request_class=request_class,
error_view=None): **kwargs)
(super(ProxyArchivalRouter, self).
__init__(routes,
hostpaths=hostpaths,
port=port,
abs_path=abs_path,
home_view=home_view,
error_view=error_view))
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
#self.error_view = error_view
def __call__(self, env): def __call__(self, env):
response = self.proxy(env) response = self.proxy(env)
@ -44,11 +33,12 @@ class ProxyArchivalRouter(ArchivalRouter):
# Only supports latest capture replay at the moment # Only supports latest capture replay at the moment
#================================================================= #=================================================================
class ProxyRouter: class ProxyRouter:
def __init__(self, handler, hostpaths=None, error_view=None): def __init__(self, handler, **kwargs):
self.handler = handler self.handler = handler
self.hostpaths = hostpaths self.hostpaths = kwargs.get('hostpaths')
self.error_view = error_view self.error_view = kwargs.get('error_view')
self.request_class = kwargs.get('request_class')
def __call__(self, env): def __call__(self, env):
url = env['REL_REQUEST_URI'] url = env['REL_REQUEST_URI']
@ -59,10 +49,9 @@ class ProxyRouter:
if not url.startswith('http://'): if not url.startswith('http://'):
return None return None
wbrequest = WbRequest(env, wbrequest = self.request_class(env,
request_uri=url, request_uri=url,
wb_url_str=url, wb_url_str=url,
#rel_prefix=url,
host_prefix=self.hostpaths[0], host_prefix=self.hostpaths[0],
wburl_class=self.handler.get_wburl_type(), wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter, urlrewriter_class=HttpsUrlRewriter,

View File

@ -1,22 +1,8 @@
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
class NotFoundException(WbException):
def status(self):
return '404 Not Found'
# Exceptions that affect a specific capture and result in a retry # Exceptions that affect a specific capture and result in a retry
class CaptureException(WbException): class CaptureException(WbException):
def status(self): def status(self):
return '502 Internal Server Error' return '502 Internal Server Error'
class InternalRedirect(WbException):
def __init__(self, location, status='302 Internal Redirect'):
WbException.__init__(self, 'Redirecting -> ' + location)
self.status = status
self.httpHeaders = [('Location', location)]
def status(self):
return self.status

View File

@ -1,9 +1,9 @@
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
import pprint import pprint
#================================================================= #=================================================================
class WbRequest: class WbRequest(object):
""" """
Represents the main pywb request object. Represents the main pywb request object.
@ -84,6 +84,8 @@ class WbRequest:
# PERF # PERF
env['X_PERF'] = {} env['X_PERF'] = {}
self._parse_extra()
def _is_ajax(self): def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH') value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value: if not value:
@ -101,18 +103,25 @@ class WbRequest:
varstr = pprint.pformat(varlist) varstr = pprint.pformat(varlist)
return varstr return varstr
def _parse_extra(self):
pass
#================================================================= #=================================================================
class WbResponse: class WbResponse(object):
""" """
Represents a pywb wsgi response object. Represents a pywb wsgi response object.
Holds a status_headers object and a response iter, to be Holds a status_headers object and a response iter, to be
returned to wsgi container. returned to wsgi container.
""" """
def __init__(self, status_headers, value=[]): def __init__(self, status_headers, value=[], **kwargs):
self.status_headers = status_headers self.status_headers = status_headers
self.body = value self.body = value
self._init_derived(kwargs)
def _init_derived(self, params):
pass
@staticmethod @staticmethod
def text_stream(stream, status='200 OK', content_type='text/plain'): def text_stream(stream, status='200 OK', content_type='text/plain'):

View File

@ -1,7 +1,7 @@
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from wbexceptions import NotFoundException, InternalRedirect #from wbexceptions import InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
@ -66,8 +66,8 @@ class WSGIApp(object):
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg) raise NotFoundException(msg)
except InternalRedirect as ir: # except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) # return ir.response
except WbException as e: except WbException as e:
response = handle_exception(env, wb_router, e, False) response = handle_exception(env, wb_router, e, False)

View File

@ -1,9 +1,9 @@
from pywb.utils.canonicalize import UrlCanonicalizer from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import NotFoundException
BLOCK = '["block"]' BLOCK = '["block"]'
ALLOW = '["allow"]' ALLOW = '["allow"]'

View File

@ -214,6 +214,11 @@ class WbUrl(BaseWbUrl):
return True return True
def set_replay_timestamp(self, timestamp):
    """
    Turn this url into a replay url for the given timestamp.

    Sets ``self.type`` to ``self.REPLAY`` and stores ``timestamp``
    on ``self.timestamp``.
    """
    # the two assignments are independent of each other
    self.type = self.REPLAY
    self.timestamp = timestamp
# Str Representation # Str Representation
# ==================== # ====================
def to_str(self, **overrides): def to_str(self, **overrides):

View File

@ -4,7 +4,7 @@
import surt import surt
import urlparse import urlparse
from wbexception import WbException from wbexception import BadRequestException
#================================================================= #=================================================================
@ -17,10 +17,8 @@ class UrlCanonicalizer(object):
#================================================================= #=================================================================
class UrlCanonicalizeException(WbException): class UrlCanonicalizeException(BadRequestException):
def status(self): pass
return '400 Bad Request'
#================================================================= #=================================================================
def canonicalize(url, surt_ordered=True): def canonicalize(url, surt_ordered=True):

View File

@ -8,6 +8,7 @@ import time
import datetime import datetime
import calendar import calendar
from itertools import imap from itertools import imap
from email.utils import parsedate, formatdate
#================================================================= #=================================================================
# str <-> datetime conversion # str <-> datetime conversion
@ -38,6 +39,30 @@ def iso_date_to_datetime(string):
return the_datetime return the_datetime
def http_date_to_datetime(string):
    """
    Parse an HTTP date string into a naive datetime.

    Raises ValueError if the string cannot be parsed as a date.

    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    parsed = parsedate(string)

    # parsedate() signals failure by returning None instead of raising;
    # fail explicitly rather than with a TypeError from slicing None
    if parsed is None:
        raise ValueError('Invalid HTTP date: {0!r}'.format(string))

    # only year..second are needed; the remaining fields are dropped
    return datetime.datetime(*parsed[:6])
def datetime_to_http_date(the_datetime):
    """
    Format a datetime as an HTTP date string with a 'GMT' suffix.

    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    # convert to seconds since the epoch from the UTC time tuple
    utc_fields = the_datetime.utctimetuple()
    epoch_secs = calendar.timegm(utc_fields)

    return formatdate(epoch_secs, localtime=False, usegmt=True)
def datetime_to_timestamp(the_datetime): def datetime_to_timestamp(the_datetime):
""" """
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12)) >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
@ -59,6 +84,17 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string)) return datetime_to_timestamp(iso_date_to_datetime(string))
def http_date_to_timestamp(string):
    """
    Convert an HTTP date string to a timestamp string.

    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    parsed_datetime = http_date_to_datetime(string)
    return datetime_to_timestamp(parsed_datetime)
# pad to certain length (default 6) # pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6): def _pad_timestamp(string, pad_str=PAD_6):
""" """
@ -215,6 +251,17 @@ def timestamp_to_sec(string):
return calendar.timegm(timestamp_to_datetime(string).utctimetuple()) return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
def timestamp_to_http_date(string):
    """
    Convert a timestamp string to an HTTP date string.

    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    parsed_datetime = timestamp_to_datetime(string)
    return datetime_to_http_date(parsed_datetime)
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -10,3 +10,15 @@ class WbException(Exception):
class AccessException(WbException): class AccessException(WbException):
def status(self): def status(self):
return '403 Access Denied' return '403 Access Denied'
#=================================================================
class BadRequestException(WbException):
    """WbException whose status() is '400 Bad Request'."""

    def status(self):
        return '400 Bad Request'
#=================================================================
class NotFoundException(WbException):
    """WbException whose status() is '404 Not Found'."""

    def status(self):
        return '404 Not Found'

View File

@ -5,7 +5,7 @@ import yaml
@pytest.fixture @pytest.fixture
def testconfig(): def testconfig():
config = yaml.load(open('test_config.yaml')) config = yaml.load(open('tests/test_config.yaml'))
assert config assert config
if 'index_paths' not in config: if 'index_paths' not in config:
# !!! assumes this module is in a sub-directory of project root. # !!! assumes this module is in a sub-directory of project root.

View File

@ -80,10 +80,6 @@ absoulte_paths: true
static_routes: static_routes:
static/test/route: static/ static/test/route: static/
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing
# Enable simple http proxy mode # Enable simple http proxy mode
enable_http_proxy: true enable_http_proxy: true
@ -100,5 +96,11 @@ reporter: !!python/object/new:tests.fixture.PrintReporter []
# custom rules for domain specific matching # custom rules for domain specific matching
#domain_specific_rules: rules.yaml #domain_specific_rules: rules.yaml
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
perms_policy: !!python/name:pywb.perms.test.test_perms_policy.perms_policy perms_policy: !!python/name:pywb.perms.test.test_perms_policy.perms_policy
# not testing memento here
enable_memento: False

View File

@ -4,7 +4,7 @@ from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
class TestWb: class TestWb:
TEST_CONFIG = 'test_config.yaml' TEST_CONFIG = 'tests/test_config.yaml'
def setup(self): def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())

175
tests/test_memento.py Normal file
View File

@ -0,0 +1,175 @@
import webtest
from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
MEMENTO_DATETIME = 'Memento-Datetime'
ACCEPT_DATETIME = 'Accept-Datetime'
LINK = 'Link'
VARY = 'Vary'
class TestWb:
    """
    Memento protocol tests run against the pywb WSGI app,
    covering both archival (non-proxy) mode and proxy mode.
    """

    TEST_CONFIG = 'tests/test_config_memento.yaml'

    def setup(self):
        """
        Create the app from TEST_CONFIG and wrap it in a webtest client
        """
        wsgi_app = init_app(create_wb_router,
                            load_yaml=True,
                            config_file=self.TEST_CONFIG)

        self.app = wsgi_app
        self.testapp = webtest.TestApp(wsgi_app)

    @staticmethod
    def _proxy_environ(request_uri):
        """
        WSGI environ overrides used to simulate a proxy mode request:
        REQUEST_URI carries the full url, SCRIPT_NAME is emptied
        """
        return {'REQUEST_URI': request_uri, 'SCRIPT_NAME': ''}

    # ------------------------------------------------------------------
    # Archival (non-proxy) mode
    # Intended to conform to Memento protocol Pattern 2.1
    # http://www.mementoweb.org/guide/rfc/#Pattern2.1
    # ------------------------------------------------------------------

    def test_timegate_latest(self):
        """
        TimeGate request sent without an Accept-Datetime header
        """
        response = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')

        assert response.status_int == 302

        assert response.headers[VARY] == 'accept-datetime'
        assert response.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in response.headers

        location = response.headers['Location']
        assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in location

    def test_timegate_accept_datetime(self):
        """
        TimeGate request sent with an Accept-Datetime header
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}
        response = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                    headers=req_headers)

        assert response.status_int == 302

        assert response.headers[VARY] == 'accept-datetime'
        assert response.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in response.headers

        location = response.headers['Location']
        assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in location

    def test_non_timegate_intermediate_redir(self):
        """
        An 'intermediate resource' rather than a timegate:
        expect a redirect to the closest timestamp
        """
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        # the partial timestamp /2014/ in the path means this is not a timegate
        response = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css',
                                    headers=req_headers)

        assert response.status_int == 302

        # a non-timegate response carries no vary header
        assert VARY not in response.headers
        assert response.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
        assert MEMENTO_DATETIME not in response.headers

        # Accept-Datetime is not negotiated here, the redirect goes to the latest
        assert '/pywb/20140127171239/' in response.headers['Location']

    def test_memento_url(self):
        """
        Memento response for a capture served with a 200
        """
        response = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css')

        assert response.status_int == 200

        assert VARY not in response.headers

        expected_link = ('<http://www.iana.org/_css/2013.1/screen.css>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"')
        assert response.headers[LINK] == expected_link
        assert response.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_302_memento(self):
        """
        Memento response for a capture that is itself a 302
        """
        response = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example')

        assert response.status_int == 302

        assert VARY not in response.headers

        expected_link = ('<http://www.iana.org/domains/example>; rel="original", '
                         '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"')
        assert response.headers[LINK] == expected_link
        assert response.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'

    # ------------------------------------------------------------------
    # Proxy mode
    # Intended to roughly conform to Memento protocol Pattern 1.3,
    # except that the original resource is not available
    # ------------------------------------------------------------------

    def test_proxy_latest_memento(self):
        """
        Proxy mode request without Accept-Datetime:
        the response is both a timegate and a memento
        """
        environ = self._proxy_environ('http://www.iana.org/_css/2013.1/screen.css')

        response = self.testapp.get('/x-ignore-this-x', extra_environ=environ)

        assert response.status_int == 200

        # timegate part of the response
        assert response.headers[VARY] == 'accept-datetime'

        # memento part of the response
        assert response.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert response.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT'

    def test_proxy_accept_datetime_memento(self):
        """
        Proxy mode request with a specific Accept-Datetime:
        the response is both a timegate and a memento
        """
        environ = self._proxy_environ('http://www.iana.org/_css/2013.1/screen.css')
        req_headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}

        response = self.testapp.get('/x-ignore-this-x',
                                    extra_environ=environ,
                                    headers=req_headers)

        assert response.status_int == 200

        # timegate part of the response
        assert response.headers[VARY] == 'accept-datetime'

        # memento part of the response
        assert response.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
        assert response.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'

    def test_error_bad_accept_datetime(self):
        """
        A malformed Accept-Datetime yields a 400 response
        """
        req_headers = {ACCEPT_DATETIME: 'Sun'}

        # status=400 tells webtest that the error status is the expected one
        response = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css',
                                    headers=req_headers,
                                    status=400)

        assert response.status_int == 400

    def test_error_bad_accept_datetime_proxy(self):
        """
        A malformed Accept-Datetime yields a 400 response
        in proxy mode as well
        """
        environ = self._proxy_environ('http://www.iana.org/_css/2013.1/screen.css')
        req_headers = {ACCEPT_DATETIME: 'Sun, abc'}

        # status=400 tells webtest that the error status is the expected one
        response = self.testapp.get('/x-ignore-this-x',
                                    extra_environ=environ,
                                    headers=req_headers,
                                    status=400)

        assert response.status_int == 400

View File

@ -5,7 +5,7 @@ from pywb.perms.perms_handler import ALLOW, BLOCK
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
class TestPermsApp: class TestPermsApp:
TEST_CONFIG = 'test_config.yaml' TEST_CONFIG = 'tests/test_config.yaml'
def setup(self): def setup(self):
self.app = init_app(create_perms_checker_app, self.app = init_app(create_perms_checker_app,