mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' for 0.5.0 release
This commit is contained in:
commit
c3fecb6e0d
30
CHANGES.rst
30
CHANGES.rst
@ -1,3 +1,33 @@
|
||||
pywb 0.5.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Catch live rewrite errors and display more friendly pywb error message.
|
||||
|
||||
* LiveRewriteHandler and WBHandler refactoring: LiveRewriteHandler now supports a root search page html template.
|
||||
|
||||
* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting).
|
||||
|
||||
* Fix client side rewriting (wombat.js) for proxy mode: only rewrite https -> http in absolute urls.
|
||||
|
||||
* Fixes to memento timemap/timegate to work with framed replay mode.
|
||||
|
||||
* Support for a fallback handler which will be called from a replay handler instead of a 404 response.
|
||||
|
||||
The handler, specified via the ``fallback`` option, can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404.
|
||||
|
||||
* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb``.
|
||||
|
||||
* ``live-rewrite-server`` has optional ``--proxy host:port`` param to specify loading live web data through an HTTP/S proxy, such as for use with a recording proxy.
|
||||
|
||||
* wombat: add document.cookie -> document.WB_wombat_cookie rewriting to check and rewrite Path= to archival url
|
||||
|
||||
* Better parent relative '../' path rewriting, resolved to correct absolute urls when rewritten. Additional testing for parent relative urls.
|
||||
|
||||
* New 'proxy_options' block, including 'use_default_coll' to allow defaulting to first collection w/o proxy auth.
|
||||
|
||||
* Improved support for proxy mode, allow different collections to be selected via proxy auth
|
||||
|
||||
|
||||
pywb 0.4.7 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
24
README.rst
24
README.rst
@ -1,4 +1,4 @@
|
||||
PyWb 0.4.7
|
||||
PyWb 0.5.0
|
||||
==========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
|
||||
@ -11,9 +11,25 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
|
||||
|
||||
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
|
||||
|
||||
*For an example of deployed service using pywb, please see the https://webrecorder.io project*
|
||||
|
||||
pywb Tools
|
||||
Usage Examples
|
||||
-----------------------------
|
||||
|
||||
This README contains a basic overview of using pywb. After reading this intro, consider also taking a look at these separate projects:
|
||||
|
||||
* `pywb-webrecorder <https://github.com/ikreymer/pywb-webrecorder>`_ demonstrates a way to use pywb and warcprox to record web content while browsing.
|
||||
|
||||
* `pywb-samples <https://github.com/ikreymer/pywb-samples>`_ provides additional archive samples with difficult-to-replay content.
|
||||
|
||||
|
||||
The following deployed applications use pywb:
|
||||
|
||||
* https://perma.cc embeds pywb as part of a larger `open source application <https://github.com/harvard-lil/perma>`_ to provide web archive replay for law libraries.
|
||||
|
||||
* https://webrecorder.io uses pywb and builds upon pywb-webrecorder to create a hosted web recording and replay system.
|
||||
|
||||
|
||||
pywb Tools Overview
|
||||
-----------------------------
|
||||
|
||||
In addition to the standard wayback machine (explained further below), pywb tool suite includes a
|
||||
@ -72,7 +88,7 @@ This process can be done by running the ``cdx-indexer`` script and only needs to
|
||||
|
||||
Given an archive of warcs at ``myarchive/warcs``
|
||||
|
||||
1. Create a dir for indexs, .eg. ``myarchive/cdx``
|
||||
1. Create a dir for indexes, e.g. ``myarchive/cdx``
|
||||
|
||||
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
|
||||
warc/arc file in ``myarchive/warcs``
|
||||
|
@ -91,6 +91,12 @@ static_routes:
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
||||
# Additional proxy options (defaults)
|
||||
#proxy_options:
|
||||
# use_default_coll: true
|
||||
#
|
||||
# unaltered_replay: false
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
enable_cdx_api: true
|
||||
|
||||
|
@ -2,15 +2,36 @@ from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||
|
||||
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
|
||||
|
||||
from argparse import ArgumentParser
|
||||
|
||||
|
||||
#=================================================================
|
||||
# init cdx server app
|
||||
# init rewrite server app
|
||||
#=================================================================
|
||||
|
||||
application = init_app(create_live_rewriter_app, load_yaml=False)
|
||||
def create_app():
|
||||
parser = ArgumentParser(description='Live Rewrite Server')
|
||||
|
||||
parser.add_argument('-x', '--proxy',
|
||||
action='store',
|
||||
help='Specify host:port to use as HTTP/S proxy')
|
||||
|
||||
result, unknown = parser.parse_known_args()
|
||||
|
||||
config = dict(proxyhostport=result.proxy, framed_replay=True)
|
||||
|
||||
app = init_app(create_live_rewriter_app, load_yaml=False,
|
||||
config=config)
|
||||
|
||||
return app
|
||||
|
||||
|
||||
application = create_app()
|
||||
|
||||
|
||||
def main(): # pragma: no cover
|
||||
start_wsgi_server(application, 'Live Rewriter App', default_port=8090)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -58,7 +58,7 @@ class BaseCDXServer(object):
|
||||
return self.load_cdx(**fuzzy_query_params)
|
||||
|
||||
msg = 'No Captures found for: ' + query.url
|
||||
raise NotFoundException(msg)
|
||||
raise NotFoundException(msg, url=query.url)
|
||||
|
||||
def _calc_search_keys(self, query):
|
||||
return calc_search_range(url=query.url,
|
||||
|
@ -29,16 +29,48 @@ class ArchivalRouter(object):
|
||||
self.error_view = kwargs.get('error_view')
|
||||
|
||||
def __call__(self, env):
|
||||
request_uri = env['REL_REQUEST_URI']
|
||||
|
||||
for route in self.routes:
|
||||
result = route(env, self.abs_path)
|
||||
if result:
|
||||
return result
|
||||
matcher, coll = route.is_handling(request_uri)
|
||||
if matcher:
|
||||
wbrequest = self.parse_request(route, env, matcher,
|
||||
coll, request_uri,
|
||||
use_abs_prefix=self.abs_path)
|
||||
|
||||
return route.handler(wbrequest)
|
||||
|
||||
# Default Home Page
|
||||
if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
|
||||
if request_uri in ['/', '/index.html', '/index.htm']:
|
||||
return self.render_home_page(env)
|
||||
|
||||
return self.fallback(env, self.routes) if self.fallback else None
|
||||
return self.fallback(env, self) if self.fallback else None
|
||||
|
||||
def parse_request(self, route, env, matcher, coll, request_uri,
|
||||
use_abs_prefix=False):
|
||||
matched_str = matcher.group(0)
|
||||
if matched_str:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||
# remove the '/' + rel_prefix part of uri
|
||||
wb_url_str = request_uri[len(matched_str) + 2:]
|
||||
else:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/'
|
||||
# the request_uri is the wb_url, since no coll
|
||||
wb_url_str = request_uri[1:]
|
||||
|
||||
wbrequest = route.request_class(env,
|
||||
request_uri=request_uri,
|
||||
wb_url_str=wb_url_str,
|
||||
rel_prefix=rel_prefix,
|
||||
coll=coll,
|
||||
use_abs_prefix=use_abs_prefix,
|
||||
wburl_class=route.handler.get_wburl_type(),
|
||||
urlrewriter_class=UrlRewriter)
|
||||
|
||||
# Allow for applying of additional filters
|
||||
route.apply_filters(wbrequest, matcher)
|
||||
|
||||
return wbrequest
|
||||
|
||||
def render_home_page(self, env):
|
||||
# render the homepage!
|
||||
@ -73,45 +105,15 @@ class Route(object):
|
||||
self.coll_group = coll_group
|
||||
self._custom_init(config)
|
||||
|
||||
def __call__(self, env, use_abs_prefix):
|
||||
wbrequest = self.parse_request(env, use_abs_prefix)
|
||||
return self.handler(wbrequest) if wbrequest else None
|
||||
|
||||
def parse_request(self, env, use_abs_prefix, request_uri=None):
|
||||
if not request_uri:
|
||||
request_uri = env['REL_REQUEST_URI']
|
||||
|
||||
def is_handling(self, request_uri):
|
||||
matcher = self.regex.match(request_uri[1:])
|
||||
if not matcher:
|
||||
return None
|
||||
|
||||
matched_str = matcher.group(0)
|
||||
if matched_str:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||
# remove the '/' + rel_prefix part of uri
|
||||
wb_url_str = request_uri[len(matched_str) + 2:]
|
||||
else:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/'
|
||||
# the request_uri is the wb_url, since no coll
|
||||
wb_url_str = request_uri[1:]
|
||||
return None, None
|
||||
|
||||
coll = matcher.group(self.coll_group)
|
||||
return matcher, coll
|
||||
|
||||
wbrequest = self.request_class(env,
|
||||
request_uri=request_uri,
|
||||
wb_url_str=wb_url_str,
|
||||
rel_prefix=rel_prefix,
|
||||
coll=coll,
|
||||
use_abs_prefix=use_abs_prefix,
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
urlrewriter_class=UrlRewriter)
|
||||
|
||||
# Allow for applying of additional filters
|
||||
self._apply_filters(wbrequest, matcher)
|
||||
|
||||
return wbrequest
|
||||
|
||||
def _apply_filters(self, wbrequest, matcher):
|
||||
def apply_filters(self, wbrequest, matcher):
|
||||
for filter in self.filters:
|
||||
last_grp = len(matcher.groups())
|
||||
filter_str = filter.format(matcher.group(last_grp))
|
||||
@ -136,9 +138,11 @@ class ReferRedirect:
|
||||
else:
|
||||
self.match_prefixs = [match_prefixs]
|
||||
|
||||
def __call__(self, env, routes):
|
||||
def __call__(self, env, the_router):
|
||||
referrer = env.get('HTTP_REFERER')
|
||||
|
||||
routes = the_router.routes
|
||||
|
||||
# ensure there is a referrer
|
||||
if referrer is None:
|
||||
return None
|
||||
@ -166,17 +170,15 @@ class ReferRedirect:
|
||||
ref_request = None
|
||||
|
||||
for route in routes:
|
||||
ref_request = route.parse_request(env, False, request_uri=path)
|
||||
if ref_request:
|
||||
matcher, coll = route.is_handling(path)
|
||||
if matcher:
|
||||
ref_request = the_router.parse_request(route, env,
|
||||
matcher, coll, path)
|
||||
ref_route = route
|
||||
break
|
||||
|
||||
# must have matched one of the routes
|
||||
if not ref_request:
|
||||
return None
|
||||
|
||||
# must have a rewriter
|
||||
if not ref_request.urlrewriter:
|
||||
# must have matched one of the routes with a urlrewriter
|
||||
if not ref_request or not ref_request.urlrewriter:
|
||||
return None
|
||||
|
||||
rewriter = ref_request.urlrewriter
|
||||
|
@ -11,15 +11,13 @@ LINK_FORMAT = 'application/link-format'
|
||||
#=================================================================
|
||||
class MementoReqMixin(object):
|
||||
def _parse_extra(self):
|
||||
self.is_timegate = False
|
||||
|
||||
if not self.wb_url:
|
||||
return
|
||||
|
||||
if self.wb_url.type != self.wb_url.LATEST_REPLAY:
|
||||
return
|
||||
|
||||
self.is_timegate = True
|
||||
self.options['is_timegate'] = True
|
||||
|
||||
accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
|
||||
if not accept_datetime:
|
||||
@ -48,7 +46,7 @@ class MementoRespMixin(object):
|
||||
if not wbrequest or not wbrequest.wb_url:
|
||||
return
|
||||
|
||||
is_timegate = wbrequest.is_timegate
|
||||
is_timegate = wbrequest.options.get('is_timegate', False)
|
||||
|
||||
if is_timegate:
|
||||
self.status_headers.headers.append(('Vary', 'accept-datetime'))
|
||||
@ -59,7 +57,7 @@ class MementoRespMixin(object):
|
||||
is_memento = False
|
||||
|
||||
# otherwise, if in proxy mode, then always a memento
|
||||
elif wbrequest.is_proxy:
|
||||
elif wbrequest.options['is_proxy']:
|
||||
is_memento = True
|
||||
|
||||
# otherwise only for replay
|
||||
@ -80,7 +78,7 @@ class MementoRespMixin(object):
|
||||
link.append(self.make_link(req_url, 'original'))
|
||||
|
||||
# for now, include timemap only in non-proxy mode
|
||||
if not wbrequest.is_proxy and (is_memento or is_timegate):
|
||||
if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
|
||||
link.append(self.make_timemap_link(wbrequest))
|
||||
|
||||
if is_memento and not is_timegate:
|
||||
@ -117,6 +115,7 @@ def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
|
||||
memento = '<{0}>; rel="{1}"; datetime="{2}"' + end
|
||||
|
||||
string = WbUrl.to_wburl_str(url=cdx['original'],
|
||||
mod='mp_',
|
||||
timestamp=cdx['timestamp'],
|
||||
type=WbUrl.REPLAY)
|
||||
|
||||
@ -140,7 +139,8 @@ def make_timemap(wbrequest, cdx_lines):
|
||||
# timemap link
|
||||
timemap = ('<{0}>; rel="self"; ' +
|
||||
'type="application/link-format"; from="{1}",\n')
|
||||
yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)
|
||||
yield timemap.format(prefix + wbrequest.wb_url.to_str(),
|
||||
from_date)
|
||||
|
||||
# original link
|
||||
original = '<{0}>; rel="original",\n'
|
||||
@ -148,7 +148,7 @@ def make_timemap(wbrequest, cdx_lines):
|
||||
|
||||
# timegate link
|
||||
timegate = '<{0}>; rel="timegate",\n'
|
||||
yield timegate.format(prefix + url)
|
||||
yield timegate.format(prefix + 'mp_/' + url)
|
||||
|
||||
# first memento link
|
||||
yield make_memento_link(first_cdx, prefix,
|
||||
|
@ -1,8 +1,11 @@
|
||||
from wbrequestresponse import WbResponse, WbRequest
|
||||
from archivalrouter import ArchivalRouter
|
||||
|
||||
import urlparse
|
||||
import base64
|
||||
|
||||
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -15,10 +18,7 @@ class ProxyArchivalRouter(ArchivalRouter):
|
||||
"""
|
||||
def __init__(self, routes, **kwargs):
|
||||
super(ProxyArchivalRouter, self).__init__(routes, **kwargs)
|
||||
request_class = routes[0].request_class
|
||||
self.proxy = ProxyRouter(routes[0].handler,
|
||||
request_class=request_class,
|
||||
**kwargs)
|
||||
self.proxy = ProxyRouter(routes, **kwargs)
|
||||
|
||||
def __call__(self, env):
|
||||
response = self.proxy(env)
|
||||
@ -43,12 +43,23 @@ class ProxyRouter(object):
|
||||
See: http://www.mementoweb.org/guide/rfc/#Pattern1.3
|
||||
for more details.
|
||||
"""
|
||||
def __init__(self, handler, **kwargs):
|
||||
self.handler = handler
|
||||
|
||||
def __init__(self, routes, **kwargs):
|
||||
self.routes = routes
|
||||
self.hostpaths = kwargs.get('hostpaths')
|
||||
|
||||
self.error_view = kwargs.get('error_view')
|
||||
self.request_class = kwargs.get('request_class')
|
||||
|
||||
proxy_options = kwargs.get('config', {})
|
||||
if proxy_options:
|
||||
proxy_options = proxy_options.get('proxy_options', {})
|
||||
|
||||
self.auth_msg = proxy_options.get('auth_msg',
|
||||
'Please enter name of a collection to use for proxy mode')
|
||||
|
||||
self.use_default_coll = proxy_options.get('use_default_coll', True)
|
||||
|
||||
self.unaltered = proxy_options.get('unaltered_replay', False)
|
||||
|
||||
def __call__(self, env):
|
||||
url = env['REL_REQUEST_URI']
|
||||
@ -59,16 +70,57 @@ class ProxyRouter(object):
|
||||
if not url.startswith('http://'):
|
||||
return None
|
||||
|
||||
wbrequest = self.request_class(env,
|
||||
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
|
||||
|
||||
route = None
|
||||
coll = None
|
||||
matcher = None
|
||||
|
||||
if proxy_auth:
|
||||
proxy_coll = self.read_basic_auth_coll(proxy_auth)
|
||||
|
||||
if not proxy_coll:
|
||||
return self.proxy_auth_coll_response()
|
||||
|
||||
proxy_coll = '/' + proxy_coll + '/'
|
||||
|
||||
for r in self.routes:
|
||||
matcher, c = r.is_handling(proxy_coll)
|
||||
if matcher:
|
||||
route = r
|
||||
coll = c
|
||||
break
|
||||
|
||||
if not route:
|
||||
return self.proxy_auth_coll_response()
|
||||
|
||||
# if 'use_default_coll' or only one collection, use that
|
||||
# for proxy mode
|
||||
elif self.use_default_coll or len(self.routes) == 1:
|
||||
route = self.routes[0]
|
||||
coll = self.routes[0].regex.pattern
|
||||
|
||||
# otherwise, require proxy auth 407 to select collection
|
||||
else:
|
||||
return self.proxy_auth_coll_response()
|
||||
|
||||
wbrequest = route.request_class(env,
|
||||
request_uri=url,
|
||||
wb_url_str=url,
|
||||
coll=coll,
|
||||
host_prefix=self.hostpaths[0],
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
wburl_class=route.handler.get_wburl_type(),
|
||||
urlrewriter_class=HttpsUrlRewriter,
|
||||
use_abs_prefix=False,
|
||||
is_proxy=True)
|
||||
|
||||
return self.handler(wbrequest)
|
||||
if matcher:
|
||||
route.apply_filters(wbrequest, matcher)
|
||||
|
||||
if self.unaltered:
|
||||
wbrequest.wb_url.mod = 'id_'
|
||||
|
||||
return route.handler(wbrequest)
|
||||
|
||||
# Proxy Auto-Config (PAC) script for the proxy
|
||||
def make_pac_response(self, env):
|
||||
@ -97,3 +149,27 @@ class ProxyRouter(object):
|
||||
content_type = 'application/x-ns-proxy-autoconfig'
|
||||
|
||||
return WbResponse.text_response(buff, content_type=content_type)
|
||||
|
||||
def proxy_auth_coll_response(self):
|
||||
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
|
||||
|
||||
headers = [('Content-Type', 'text/plain'),
|
||||
('Proxy-Authenticate', proxy_msg)]
|
||||
|
||||
status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
|
||||
|
||||
value = self.auth_msg
|
||||
|
||||
return WbResponse(status_headers, value=[value])
|
||||
|
||||
@staticmethod
|
||||
def read_basic_auth_coll(value):
|
||||
parts = value.split(' ')
|
||||
if parts[0].lower() != 'basic':
|
||||
return ''
|
||||
|
||||
if len(parts) != 2:
|
||||
return ''
|
||||
|
||||
user_pass = base64.b64decode(parts[1])
|
||||
return user_pass.split(':')[0]
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""
|
||||
# Test WbRequest parsed via a Route
|
||||
# route with relative path, print resulting wbrequest
|
||||
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
|
||||
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''})
|
||||
{'coll': 'web',
|
||||
'request_uri': '/web/test.example.com',
|
||||
'wb_prefix': '/web/',
|
||||
@ -9,21 +9,21 @@
|
||||
|
||||
|
||||
# route with absolute path, running at script /my_pywb, print resultingwbrequest
|
||||
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
|
||||
>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
|
||||
{'coll': 'web',
|
||||
'request_uri': '/web/2013im_/test.example.com',
|
||||
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
|
||||
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
|
||||
|
||||
# route with no collection
|
||||
>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
|
||||
>>> _test_route_req(Route('', BaseHandler()), {'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'})
|
||||
{'coll': '',
|
||||
'request_uri': 'http://example.com',
|
||||
'wb_prefix': '/pywb/',
|
||||
'wb_url': None}
|
||||
|
||||
# not matching route -- skipped
|
||||
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
|
||||
>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''})
|
||||
|
||||
|
||||
# Referer Redirect Test
|
||||
@ -84,11 +84,18 @@ False
|
||||
|
||||
"""
|
||||
|
||||
from pywb.framework.archivalrouter import Route, ReferRedirect
|
||||
from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
import pprint
|
||||
|
||||
def print_req(req):
|
||||
def _test_route_req(route, env, abs_path=False):
|
||||
matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
|
||||
if not matcher:
|
||||
return
|
||||
|
||||
the_router = ArchivalRouter([route], abs_path=abs_path)
|
||||
req = the_router.parse_request(route, env, matcher, coll, env['REL_REQUEST_URI'], abs_path)
|
||||
|
||||
varlist = vars(req)
|
||||
the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll'))
|
||||
pprint.pprint(the_dict)
|
||||
@ -102,9 +109,11 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
|
||||
|
||||
routes = [Route(coll, WbUrlHandler())]
|
||||
|
||||
the_router = ArchivalRouter(routes)
|
||||
|
||||
redir = ReferRedirect(match_host)
|
||||
#req = WbRequest.from_uri(request_uri, env)
|
||||
rep = redir(env, routes)
|
||||
rep = redir(env, the_router)
|
||||
if not rep:
|
||||
return False
|
||||
|
||||
|
@ -22,7 +22,7 @@ class TestCustomErrApp:
|
||||
|
||||
|
||||
def initer(app_class):
|
||||
def init():
|
||||
def init(config=None):
|
||||
return app_class()
|
||||
return init
|
||||
|
||||
|
@ -78,12 +78,11 @@ class WbRequest(object):
|
||||
|
||||
self.referrer = env.get('HTTP_REFERER')
|
||||
|
||||
self.is_ajax = self._is_ajax()
|
||||
self.options = dict()
|
||||
self.options['is_ajax'] = self._is_ajax()
|
||||
self.options['is_proxy'] = is_proxy
|
||||
|
||||
self.query_filter = []
|
||||
|
||||
self.is_proxy = is_proxy
|
||||
|
||||
self.custom_params = {}
|
||||
|
||||
# PERF
|
||||
|
@ -112,7 +112,7 @@ DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_app(init_func, load_yaml=True, config_file=None):
|
||||
def init_app(init_func, load_yaml=True, config_file=None, config={}):
|
||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||
level=logging.DEBUG)
|
||||
logging.debug('')
|
||||
@ -129,9 +129,7 @@ def init_app(init_func, load_yaml=True, config_file=None):
|
||||
|
||||
config = load_yaml_config(config_file)
|
||||
|
||||
wb_router = init_func(config)
|
||||
else:
|
||||
wb_router = init_func()
|
||||
wb_router = init_func(config)
|
||||
except:
|
||||
msg = '*** pywb app init FAILED config from "%s"!\n'
|
||||
logging.exception(msg, init_func.__name__)
|
||||
@ -146,17 +144,8 @@ def init_app(init_func, load_yaml=True, config_file=None):
|
||||
#=================================================================
|
||||
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
|
||||
from wsgiref.simple_server import make_server
|
||||
from optparse import OptionParser
|
||||
|
||||
opt = OptionParser('%prog [OPTIONS]')
|
||||
opt.add_option('-p', '--port', type='int', default=None)
|
||||
|
||||
options, args = opt.parse_args()
|
||||
|
||||
port = options.port
|
||||
|
||||
if not port:
|
||||
port = the_app.port
|
||||
port = the_app.port
|
||||
|
||||
if not port:
|
||||
if default_port:
|
||||
|
@ -31,7 +31,7 @@ class HeaderRewriter:
|
||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||
}
|
||||
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition']
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range']
|
||||
|
||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
||||
|
||||
|
@ -126,6 +126,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
||||
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
||||
|
||||
#todo: move to mixin?
|
||||
(r'(?:[\s=(){]|^)(top)(?:[\s!=}()]|$)',
|
||||
|
@ -160,7 +160,7 @@ class RewriteContent:
|
||||
first_buff=first_buff)
|
||||
|
||||
@staticmethod
|
||||
def _decode_buff(buff, stream, encoding): # pragma: no coverage
|
||||
def _decode_buff(buff, stream, encoding): # pragma: no coverage
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
except UnicodeDecodeError, e:
|
||||
|
@ -5,6 +5,7 @@ Fetch a url from live web and apply rewriting rules
|
||||
import requests
|
||||
import datetime
|
||||
import mimetypes
|
||||
import logging
|
||||
|
||||
from urlparse import urlsplit
|
||||
|
||||
@ -19,24 +20,13 @@ from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
#=================================================================
|
||||
class LiveRewriter(object):
|
||||
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'),
|
||||
('HTTP_ACCEPT', 'Accept'),
|
||||
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
|
||||
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
||||
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
|
||||
('HTTP_RANGE', 'Range'),
|
||||
('HTTP_CACHE_CONTROL', 'Cache-Control'),
|
||||
('HTTP_X_REQUESTED_WITH', 'X-Requested-With'),
|
||||
('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'),
|
||||
('HTTP_PE_TOKEN', 'PE-Token'),
|
||||
('HTTP_COOKIE', 'Cookie'),
|
||||
('CONTENT_TYPE', 'Content-Type'),
|
||||
('CONTENT_LENGTH', 'Content-Length'),
|
||||
('REL_REFERER', 'Referer'),
|
||||
]
|
||||
|
||||
def __init__(self, defmod=''):
|
||||
def __init__(self, defmod='', default_proxy=None):
|
||||
self.rewriter = RewriteContent(defmod=defmod)
|
||||
self.default_proxy = default_proxy
|
||||
if self.default_proxy:
|
||||
logging.debug('Live Rewrite via proxy ' + self.default_proxy)
|
||||
else:
|
||||
logging.debug('Live Rewrite Direct (no proxy)')
|
||||
|
||||
def fetch_local_file(self, uri):
|
||||
fh = open(uri)
|
||||
@ -89,6 +79,10 @@ class LiveRewriter(object):
|
||||
method = 'GET'
|
||||
data = None
|
||||
|
||||
if not proxies and self.default_proxy:
|
||||
proxies = {'http': self.default_proxy,
|
||||
'https': self.default_proxy}
|
||||
|
||||
if env is not None:
|
||||
method = env['REQUEST_METHOD'].upper()
|
||||
input_ = env['wsgi.input']
|
||||
@ -156,7 +150,8 @@ class LiveRewriter(object):
|
||||
'timestamp': timestamp,
|
||||
'original': url,
|
||||
'statuscode': status_headers.get_statuscode(),
|
||||
'mimetype': status_headers.get_header('Content-Type')
|
||||
'mimetype': status_headers.get_header('Content-Type'),
|
||||
'is_live': True,
|
||||
}
|
||||
|
||||
result = (self.rewriter.
|
||||
|
@ -53,6 +53,10 @@ r"""
|
||||
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
||||
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
|
||||
|
||||
# document.cookie test
|
||||
>>> _test_js('document.cookie = "a=b; Path=/"')
|
||||
'document.WB_wombat_cookie = "a=b; Path=/"'
|
||||
|
||||
|
||||
#=================================================================
|
||||
# XML Rewriting
|
||||
@ -86,6 +90,12 @@ r"""
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
|
||||
|
||||
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
|
||||
"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
|
||||
"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background: url('')")
|
||||
"background: url('')"
|
||||
|
||||
|
@ -83,7 +83,8 @@ def test_example_domain_specific_3():
|
||||
assert '/* Bootloader.configurePage' in buff
|
||||
|
||||
def test_wombat_top():
|
||||
status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter)
|
||||
#status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter)
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/toptest.js', urlrewriter)
|
||||
|
||||
assert 'WB_wombat_top!==window' in buff
|
||||
|
||||
|
@ -1,4 +1,21 @@
|
||||
"""
|
||||
# urljoin tests
|
||||
|
||||
>>> UrlRewriter.urljoin('http://example.com/test/', '../file.html')
|
||||
'http://example.com/file.html'
|
||||
|
||||
>>> UrlRewriter.urljoin('http://example.com/test/', '../path/../../../file.html')
|
||||
'http://example.com/file.html'
|
||||
|
||||
>>> UrlRewriter.urljoin('http://example.com/test/', '/../file.html')
|
||||
'http://example.com/file.html'
|
||||
|
||||
>>> UrlRewriter.urljoin('http://example.com/', '/abc/../../file.html')
|
||||
'http://example.com/file.html'
|
||||
|
||||
>>> UrlRewriter.urljoin('http://example.com/path/more/', 'abc/../../file.html')
|
||||
'http://example.com/path/file.html'
|
||||
|
||||
# UrlRewriter tests
|
||||
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
|
||||
|
@ -57,7 +57,7 @@ class UrlRewriter(object):
|
||||
else:
|
||||
# optimize: join if not absolute url, otherwise just use that
|
||||
if not is_abs:
|
||||
new_url = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||
new_url = self.urljoin(wburl.url, url)
|
||||
else:
|
||||
new_url = url
|
||||
|
||||
@ -81,8 +81,6 @@ class UrlRewriter(object):
|
||||
if new_url.startswith(self.prefix):
|
||||
new_url = new_url[len(self.prefix):]
|
||||
|
||||
#new_wburl = copy.copy(self.wburl)
|
||||
#new_wburl.url = new_url
|
||||
new_wburl = WbUrl(new_url)
|
||||
return UrlRewriter(new_wburl, self.prefix)
|
||||
|
||||
@ -92,6 +90,39 @@ class UrlRewriter(object):
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
|
||||
@staticmethod
|
||||
def urljoin(orig_url, url):
|
||||
new_url = urlparse.urljoin(orig_url, url)
|
||||
if '../' not in new_url:
|
||||
return new_url
|
||||
|
||||
parts = urlparse.urlsplit(new_url)
|
||||
scheme, netloc, path, query, frag = parts
|
||||
|
||||
path_parts = path.split('/')
|
||||
i = 0
|
||||
n = len(path_parts) - 1
|
||||
while i < n:
|
||||
if path_parts[i] == '..':
|
||||
del path_parts[i]
|
||||
n -= 1
|
||||
if i > 0:
|
||||
del path_parts[i - 1]
|
||||
n -= 1
|
||||
i -= 1
|
||||
else:
|
||||
i += 1
|
||||
|
||||
if path_parts == ['']:
|
||||
path = '/'
|
||||
else:
|
||||
path = '/'.join(path_parts)
|
||||
|
||||
parts = (scheme, netloc, path, query, frag)
|
||||
|
||||
new_url = urlparse.urlunsplit(parts)
|
||||
return new_url
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HttpsUrlRewriter(object):
|
||||
|
@ -19,6 +19,12 @@ This file is part of pywb.
|
||||
|
||||
_wb_js = (function() {
|
||||
|
||||
|
||||
var labels = {LOADING_MSG: "Loading...",
|
||||
REPLAY_MSG: "This is an <b>archived</b> page from ",
|
||||
LIVE_MSG: "This is a <b>live</b> page loaded on "};
|
||||
|
||||
|
||||
function init_banner() {
|
||||
var PLAIN_BANNER_ID = "_wb_plain_banner";
|
||||
var FRAME_BANNER_ID = "_wb_frame_top_banner";
|
||||
@ -40,19 +46,33 @@ function init_banner() {
|
||||
|
||||
var banner = document.getElementById(bid);
|
||||
|
||||
if (!banner) {
|
||||
banner = document.createElement("wb_div");
|
||||
banner.setAttribute("id", bid);
|
||||
banner.setAttribute("lang", "en");
|
||||
|
||||
text = "This is an archived page ";
|
||||
if (wbinfo && wbinfo.capture_str) {
|
||||
text += " from <b id='_wb_capture_info'>" + wbinfo.capture_str + "</b>";
|
||||
}
|
||||
banner.innerHTML = text;
|
||||
|
||||
document.body.insertBefore(banner, document.body.firstChild);
|
||||
if (banner) {
|
||||
return;
|
||||
}
|
||||
|
||||
banner = document.createElement("wb_div");
|
||||
banner.setAttribute("id", bid);
|
||||
banner.setAttribute("lang", "en");
|
||||
|
||||
var text;
|
||||
|
||||
if (wbinfo.is_frame) {
|
||||
text = labels.LOADING_MSG;
|
||||
} else if (wbinfo.is_live) {
|
||||
text = labels.LIVE_MSG;
|
||||
} else {
|
||||
text = labels.REPLAY_MSG;
|
||||
}
|
||||
|
||||
text = "<span id='_wb_label'>" + text + "</span>";
|
||||
|
||||
var capture_str = (wbinfo ? wbinfo.capture_str : "");
|
||||
|
||||
text += "<b id='_wb_capture_info'>" + capture_str + "</b>";
|
||||
|
||||
banner.innerHTML = text;
|
||||
|
||||
document.body.insertBefore(banner, document.body.firstChild);
|
||||
}
|
||||
|
||||
function add_event(name, func, object) {
|
||||
@ -105,7 +125,10 @@ function notify_top(event) {
|
||||
}
|
||||
|
||||
if (window.top.update_wb_url) {
|
||||
window.top.update_wb_url(window.WB_wombat_location.href, wbinfo.timestamp, wbinfo.capture_str);
|
||||
window.top.update_wb_url(window.WB_wombat_location.href,
|
||||
wbinfo.timestamp,
|
||||
wbinfo.capture_str,
|
||||
wbinfo.is_live);
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,4 +149,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url &&
|
||||
window.location.replace(wbinfo.canon_url);
|
||||
}
|
||||
|
||||
return {'labels': labels};
|
||||
|
||||
})();
|
||||
|
@ -121,6 +121,15 @@ WB_wombat_init = (function() {
|
||||
return url;
|
||||
}
|
||||
|
||||
// proxy mode: If no wb_replay_prefix, only rewrite https:// -> http://
|
||||
if (!wb_replay_prefix) {
|
||||
if (starts_with(url, HTTPS_PREFIX)) {
|
||||
return HTTP_PREFIX + url.substr(HTTPS_PREFIX.length);
|
||||
} else {
|
||||
return url;
|
||||
}
|
||||
}
|
||||
|
||||
// just in case wombat reference made it into url!
|
||||
url = url.replace("WB_wombat_", "");
|
||||
|
||||
@ -181,6 +190,11 @@ WB_wombat_init = (function() {
|
||||
return "";
|
||||
}
|
||||
|
||||
// proxy mode: no extraction needed
|
||||
if (!wb_replay_prefix) {
|
||||
return href;
|
||||
}
|
||||
|
||||
href = href.toString();
|
||||
|
||||
var index = href.indexOf("/http", 1);
|
||||
@ -639,6 +653,31 @@ WB_wombat_init = (function() {
|
||||
}
|
||||
}
|
||||
|
||||
function init_cookies_override()
{
    // Captures the value of a cookie's Path attribute (optionally quoted).
    // The capture group is the tail of the overall match.
    var cookie_path_regex = /\bPath=\'?\"?([^;'"\s]+)/i;

    // Reads pass straight through to the real document.cookie.
    var get_cookie = function() {
        return document.cookie;
    };

    // Writes have their Path attribute rewritten before being stored.
    var set_cookie = function(value) {
        // document.cookie stringifies whatever is assigned to it; do the
        // same here so a non-string value does not throw on .match()
        value = String(value);

        var matched = value.match(cookie_path_regex);

        // if has cookie path, rewrite and replace
        if (matched) {
            var path = matched[1];
            var rewritten = rewrite_url(path);

            // Splice at the matched position instead of using
            // value.replace(path, ...): replace() would hit the FIRST
            // occurrence of the path text anywhere in the cookie string
            // (e.g. "a=/; Path=/" would rewrite the cookie value), and
            // would also expand "$&"-style patterns in the rewritten url.
            var path_start = matched.index + matched[0].length - path.length;

            value = value.substring(0, path_start) +
                    rewritten +
                    value.substring(path_start + path.length);
        }

        document.cookie = value;
    };

    def_prop(document, "WB_wombat_cookie", document.cookie,
             set_cookie,
             get_cookie);
}
|
||||
|
||||
//============================================
|
||||
function init_write_override()
|
||||
{
|
||||
@ -658,20 +697,22 @@ WB_wombat_init = (function() {
|
||||
//============================================
|
||||
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) {
|
||||
wb_replay_prefix = replay_prefix;
|
||||
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
|
||||
|
||||
if (capture_date.length > 0) {
|
||||
wb_capture_date_part = "/" + capture_date + "/";
|
||||
} else {
|
||||
wb_capture_date_part = "";
|
||||
if (wb_replay_prefix) {
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
|
||||
|
||||
if (capture_date.length > 0) {
|
||||
wb_capture_date_part = "/" + capture_date + "/";
|
||||
} else {
|
||||
wb_capture_date_part = "";
|
||||
}
|
||||
|
||||
wb_orig_scheme = orig_scheme + '://';
|
||||
|
||||
wb_orig_host = wb_orig_scheme + orig_host;
|
||||
|
||||
init_bad_prefixes(replay_prefix);
|
||||
}
|
||||
|
||||
wb_orig_scheme = orig_scheme + '://';
|
||||
|
||||
wb_orig_host = wb_orig_scheme + orig_host;
|
||||
|
||||
init_bad_prefixes(replay_prefix);
|
||||
|
||||
// Location
|
||||
var wombat_location = new WombatLocation(window.self.location);
|
||||
@ -747,6 +788,9 @@ WB_wombat_init = (function() {
|
||||
// Ajax
|
||||
init_ajax_rewrite();
|
||||
init_worker_override();
|
||||
|
||||
// Cookies
|
||||
init_cookies_override();
|
||||
|
||||
// DOM
|
||||
init_dom_override();
|
||||
|
@ -3,7 +3,6 @@
|
||||
<!-- Start WB Insert -->
|
||||
<script>
|
||||
wbinfo = {}
|
||||
// wbinfo.capture_str = "{{ timestamp | format_ts }}";
|
||||
wbinfo.capture_str = " ";
|
||||
wbinfo.is_embed = false;
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
@ -33,7 +32,7 @@ function make_inner_url(url, ts)
|
||||
}
|
||||
}
|
||||
|
||||
function push_state(url, timestamp, capture_str) {
|
||||
function push_state(url, timestamp, capture_str, is_live) {
|
||||
var curr_href = null;
|
||||
|
||||
if (window.frames[0].WB_wombat_location) {
|
||||
@ -41,7 +40,7 @@ function push_state(url, timestamp, capture_str) {
|
||||
}
|
||||
|
||||
if (url != curr_href) {
|
||||
update_status(capture_str);
|
||||
update_status(capture_str, is_live);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -55,14 +54,15 @@ function push_state(url, timestamp, capture_str) {
|
||||
state.inner_url = make_inner_url(url, state.timestamp);
|
||||
state.url = url;
|
||||
state.capture_str = capture_str;
|
||||
state.is_live = is_live;
|
||||
|
||||
window.history.replaceState(state, "", state.outer_url);
|
||||
|
||||
update_status(state.capture_str);
|
||||
update_status(state.capture_str, is_live);
|
||||
}
|
||||
|
||||
function pop_state(state) {
|
||||
update_status(state.capture_str);
|
||||
update_status(state.capture_str, state.is_live);
|
||||
|
||||
window.frames[0].src = state.outer_url;
|
||||
}
|
||||
@ -81,10 +81,19 @@ function extract_ts(url)
|
||||
return url.substring(inx + 1);
|
||||
}
|
||||
|
||||
function update_status(str) {
|
||||
var elem = document.getElementById("_wb_capture_info");
|
||||
if (elem) {
|
||||
elem.innerHTML = str;
|
||||
// Refresh the banner: put `str` into the capture-info element and switch
// the label between the live and replay messages. Either element may be
// absent, in which case that part is skipped.
function update_status(str, is_live) {
    var info_elem = document.getElementById("_wb_capture_info");
    if (info_elem) {
        info_elem.innerHTML = str;
    }

    var label_elem = document.getElementById("_wb_label");
    if (!label_elem) {
        return;
    }

    label_elem.innerHTML = is_live ? _wb_js.labels.LIVE_MSG
                                   : _wb_js.labels.REPLAY_MSG;
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
|
||||
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}}
|
||||
wbinfo.canon_url = "{{ canon_url }}";
|
||||
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
|
||||
</script>
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
|
||||
|
@ -9,3 +9,14 @@ The following archive collections are available:
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
Other endpoints in this deployment:
|
||||
|
||||
<ul>
|
||||
{% for route in routes %}
|
||||
{% if not route | is_wb_handler %}
|
||||
<li><b>{{ '/' + route.path }}</b> - {{ route | string }}</li>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
|
@ -215,7 +215,8 @@ def create_record_iter(arcv_iter, options):
|
||||
not append_post):
|
||||
continue
|
||||
|
||||
elif (not include_all and record.content_type == 'application/warc-fields'):
|
||||
elif (not include_all and
|
||||
record.content_type == 'application/warc-fields'):
|
||||
continue
|
||||
|
||||
entry = parse_warc_record(record)
|
||||
@ -226,7 +227,8 @@ def create_record_iter(arcv_iter, options):
|
||||
continue
|
||||
|
||||
if entry.url and not entry.key:
|
||||
entry.key = canonicalize(entry.url, options.get('surt_ordered', True))
|
||||
entry.key = canonicalize(entry.url,
|
||||
options.get('surt_ordered', True))
|
||||
|
||||
compute_digest = False
|
||||
|
||||
|
@ -77,6 +77,10 @@ class SortedCDXWriter(CDXWriter):
|
||||
return False
|
||||
|
||||
|
||||
#=================================================================
|
||||
ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')
|
||||
|
||||
|
||||
#=================================================================
|
||||
def iter_file_or_dir(inputs):
|
||||
for input_ in inputs:
|
||||
@ -84,12 +88,13 @@ def iter_file_or_dir(inputs):
|
||||
yield input_, os.path.basename(input_)
|
||||
else:
|
||||
for filename in os.listdir(input_):
|
||||
yield os.path.join(input_, filename), filename
|
||||
if filename.endswith(ALLOWED_EXT):
|
||||
yield os.path.join(input_, filename), filename
|
||||
|
||||
|
||||
#=================================================================
|
||||
def remove_ext(filename):
|
||||
for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'):
|
||||
for ext in ALLOWED_EXT:
|
||||
if filename.endswith(ext):
|
||||
filename = filename[:-len(ext)]
|
||||
break
|
||||
|
@ -25,7 +25,7 @@ class CDXAPIHandler(BaseHandler):
|
||||
return WbResponse.text_stream(cdx_iter)
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX Handler: ' + str(self.index_handler)
|
||||
return 'CDX Index Handler'
|
||||
|
||||
@staticmethod
|
||||
def extract_params_from_wsgi_env(env):
|
||||
|
@ -8,37 +8,23 @@ from pywb.utils.loaders import BlockLoader
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from replay_views import ReplayView
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
class WBHandler(WbUrlHandler):
|
||||
def __init__(self, index_reader, replay,
|
||||
search_view=None, config=None):
|
||||
|
||||
self.index_reader = index_reader
|
||||
|
||||
self.replay = replay
|
||||
|
||||
self.search_view = search_view
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||
response = self.index_reader.load_for_request(wbrequest)
|
||||
|
||||
if isinstance(response, WbResponse):
|
||||
return response
|
||||
|
||||
cdx_lines = response[0]
|
||||
cdx_callback = response[1]
|
||||
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest,
|
||||
cdx_lines,
|
||||
cdx_callback)
|
||||
class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
"""
|
||||
Loads a default search page html template to be shown when
|
||||
the wb_url is empty
|
||||
"""
|
||||
def __init__(self, config):
|
||||
self.search_view = (J2TemplateView.
|
||||
create_template(config.get('search_html'),
|
||||
'Search Page'))
|
||||
|
||||
def render_search_page(self, wbrequest, **kwargs):
|
||||
if self.search_view:
|
||||
@ -48,6 +34,67 @@ class WBHandler(WbUrlHandler):
|
||||
else:
|
||||
return WbResponse.text_response('No Lookup Url Specified')
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
class WBHandler(SearchPageWbUrlHandler):
    """
    Replay handler: looks the request up through the query handler and
    passes the resulting cdx lines to a ReplayView.

    If a ``fallback`` handler name is configured, a NotFoundException
    from the lookup is routed to that handler instead of propagating
    (see handle_not_found).
    """
    def __init__(self, query_handler, config=None):
        # config is declared optional, but every line below reads from it;
        # normalize so the default does not fail with AttributeError.
        # NOTE(review): whether ReplayView accepts an empty config is not
        # visible here -- confirm.
        if config is None:
            config = {}

        super(WBHandler, self).__init__(config)

        self.index_reader = query_handler

        cookie_maker = config.get('cookie_maker')
        record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)

        paths = config.get('archive_paths')

        resolving_loader = ResolvingLoader(paths=paths,
                                           record_loader=record_loader)

        template_globals = config.get('template_globals')
        if template_globals:
            add_env_globals(template_globals)

        self.replay = ReplayView(resolving_loader, config)

        # the fallback is only known by name until resolve_refs() is called
        self.fallback_handler = None
        self.fallback_name = config.get('fallback')

    def resolve_refs(self, handler_dict):
        """
        Resolve the configured fallback name to a handler object.

        handler_dict maps handler names to handler instances; an unknown
        name leaves the fallback unset (None).
        """
        if not self.fallback_name:
            return

        handler = handler_dict.get(self.fallback_name)

        # a handler naming itself as fallback would call itself again on
        # every miss and never terminate, so ignore a self-reference
        if handler is not self:
            self.fallback_handler = handler

    def __call__(self, wbrequest):
        """
        Handle a request: search page for an empty wb_url, otherwise
        index lookup followed by replay.
        """
        if wbrequest.wb_url_str == '/':
            return self.render_search_page(wbrequest)

        try:
            with PerfTimer(wbrequest.env.get('X_PERF'), 'query'):
                response = self.index_reader.load_for_request(wbrequest)
        except NotFoundException as nfe:
            # must be called from inside this except block:
            # handle_not_found() may re-raise the active exception
            return self.handle_not_found(wbrequest, nfe)

        # the lookup may already produce a complete response
        if isinstance(response, WbResponse):
            return response

        cdx_lines, cdx_callback = response
        return self.handle_replay(wbrequest, cdx_lines, cdx_callback)

    def handle_replay(self, wbrequest, cdx_lines, cdx_callback):
        """
        Run the replay view on the cdx lines, timed under 'replay'.
        """
        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay'):
            return self.replay(wbrequest,
                               cdx_lines,
                               cdx_callback)

    def handle_not_found(self, wbrequest, nfe):
        """
        Delegate a not-found request to the fallback handler.

        Re-raises the exception currently being handled when there is no
        fallback, or when the request is a query or an identity request.
        """
        if (not self.fallback_handler or
            wbrequest.wb_url.is_query() or
            wbrequest.wb_url.is_identity):
            # bare raise: re-raises the NotFoundException being handled
            # by the caller's except block, keeping its traceback
            raise

        return self.fallback_handler(wbrequest)

    def __str__(self):
        return 'Web Archive Replay Handler'
|
||||
|
||||
|
@ -2,23 +2,44 @@ from pywb.framework.basehandlers import WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
|
||||
from handlers import StaticHandler
|
||||
from handlers import StaticHandler, SearchPageWbUrlHandler
|
||||
|
||||
from replay_views import RewriteLiveView
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteHandler(WbUrlHandler):
|
||||
def __init__(self, config=dict(framed_replay=True)):
|
||||
class LiveResourceException(WbException):
|
||||
def status(self):
|
||||
return '400 Bad Live Resource'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteHandler(SearchPageWbUrlHandler):
|
||||
def __init__(self, config):
|
||||
super(RewriteHandler, self).__init__(config)
|
||||
self.rewrite_view = RewriteLiveView(config)
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
return self.rewrite_view(wbrequest)
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
|
||||
try:
|
||||
return self.rewrite_view(wbrequest)
|
||||
|
||||
except Exception as exc:
|
||||
url = wbrequest.wb_url.url
|
||||
msg = 'Could not load the url from the live web: ' + url
|
||||
raise LiveResourceException(msg=msg, url=url)
|
||||
|
||||
def __str__(self):
|
||||
return 'Live Web Rewrite Handler'
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_live_rewriter_app():
|
||||
routes = [Route('rewrite', RewriteHandler()),
|
||||
def create_live_rewriter_app(config={}):
|
||||
routes = [Route('rewrite', RewriteHandler(config)),
|
||||
Route('static/default', StaticHandler('pywb/static/'))
|
||||
]
|
||||
|
||||
|
@ -6,13 +6,10 @@ from pywb.framework.wbrequestresponse import WbRequest
|
||||
from pywb.framework.memento import MementoRequest
|
||||
from pywb.framework.basehandlers import BaseHandler
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from views import J2TemplateView
|
||||
from views import J2HtmlCapturesView, HeadInsertView
|
||||
|
||||
from replay_views import ReplayView
|
||||
from live_rewrite_handler import RewriteHandler
|
||||
|
||||
from query_handler import QueryHandler
|
||||
from handlers import WBHandler
|
||||
@ -62,31 +59,10 @@ class DictChain:
|
||||
|
||||
#=================================================================
|
||||
def create_wb_handler(query_handler, config):
|
||||
|
||||
cookie_maker = config.get('cookie_maker')
|
||||
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
|
||||
|
||||
paths = config.get('archive_paths')
|
||||
|
||||
resolving_loader = ResolvingLoader(paths=paths,
|
||||
record_loader=record_loader)
|
||||
|
||||
template_globals = config.get('template_globals')
|
||||
if template_globals:
|
||||
add_env_globals(template_globals)
|
||||
|
||||
replayer = ReplayView(resolving_loader, config)
|
||||
|
||||
search_view = (J2TemplateView.
|
||||
create_template(config.get('search_html'),
|
||||
'Search Page'))
|
||||
|
||||
wb_handler_class = config.get('wb_handler_class', WBHandler)
|
||||
|
||||
wb_handler = wb_handler_class(
|
||||
query_handler,
|
||||
replayer,
|
||||
search_view=search_view,
|
||||
config=config,
|
||||
)
|
||||
|
||||
@ -94,23 +70,33 @@ def create_wb_handler(query_handler, config):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_collection(value, config):
|
||||
def create_live_handler(config):
    """
    Create the live web rewrite handler for the given route config
    """
    return RewriteHandler(config)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_route_config(value, config):
|
||||
if isinstance(value, str):
|
||||
value = {'index_paths': value}
|
||||
value = dict(index_paths=value)
|
||||
|
||||
route_config = DictChain(value, config)
|
||||
return route_config
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_collection(route_config):
|
||||
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||
|
||||
html_view = (J2HtmlCapturesView.
|
||||
create_template(config.get('query_html'),
|
||||
create_template(route_config.get('query_html'),
|
||||
'Captures Page'))
|
||||
|
||||
query_handler = QueryHandler.init_from_config(route_config,
|
||||
ds_rules_file,
|
||||
html_view)
|
||||
|
||||
return route_config, query_handler
|
||||
return query_handler
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -139,8 +125,8 @@ def create_cdx_server_app(passed_config):
|
||||
routes = []
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
result = init_collection(value, config)
|
||||
route_config, query_handler = result
|
||||
route_config = init_route_config(value, config)
|
||||
query_handler = init_collection(route_config)
|
||||
|
||||
cdx_api_suffix = route_config.get('enable_cdx_api', True)
|
||||
|
||||
@ -173,23 +159,32 @@ def create_wb_router(passed_config={}):
|
||||
else:
|
||||
request_class = WbRequest
|
||||
|
||||
#if config.get('use_lxml_parser', False):
|
||||
# use_lxml_parser()
|
||||
# store live and replay handlers
|
||||
handler_dict = {}
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
|
||||
if isinstance(value, BaseHandler):
|
||||
handler_dict[name] = value
|
||||
routes.append(Route(name, value))
|
||||
continue
|
||||
|
||||
result = init_collection(value, config)
|
||||
route_config, query_handler = result
|
||||
route_config = init_route_config(value, config)
|
||||
|
||||
if route_config.get('index_paths') == '$liveweb':
|
||||
live = create_live_handler(route_config)
|
||||
handler_dict[name] = live
|
||||
routes.append(Route(name, live))
|
||||
continue
|
||||
|
||||
query_handler = init_collection(route_config)
|
||||
|
||||
wb_handler = create_wb_handler(
|
||||
query_handler=query_handler,
|
||||
config=route_config
|
||||
config=route_config,
|
||||
)
|
||||
|
||||
handler_dict[name] = wb_handler
|
||||
|
||||
logging.debug('Adding Collection: ' + name)
|
||||
|
||||
route_class = route_config.get('route_class', Route)
|
||||
@ -215,6 +210,12 @@ def create_wb_router(passed_config={}):
|
||||
for static_name, static_path in static_routes.iteritems():
|
||||
routes.append(Route(static_name, StaticHandler(static_path)))
|
||||
|
||||
# resolve any cross handler references
|
||||
for route in routes:
|
||||
if hasattr(route.handler, 'resolve_refs'):
|
||||
route.handler.resolve_refs(handler_dict)
|
||||
|
||||
|
||||
# Check for new proxy mode!
|
||||
if config.get('enable_http_proxy', False):
|
||||
router = ProxyArchivalRouter
|
||||
@ -237,5 +238,7 @@ def create_wb_router(passed_config={}):
|
||||
'Home Page'),
|
||||
|
||||
error_view=J2TemplateView.create_template(config.get('error_html'),
|
||||
'Error Page')
|
||||
'Error Page'),
|
||||
|
||||
config=config
|
||||
)
|
||||
|
@ -65,14 +65,15 @@ class BaseContentView(object):
|
||||
def __call__(self, wbrequest, *args):
|
||||
# render top level frame if in frame mode
|
||||
# (not supported in proxy mode)
|
||||
if (self.is_frame_mode and
|
||||
not wbrequest.is_proxy and
|
||||
not wbrequest.wb_url.mod):
|
||||
if (self.is_frame_mode and wbrequest.wb_url and
|
||||
not wbrequest.wb_url.mod and
|
||||
not wbrequest.options['is_proxy'] and
|
||||
not wbrequest.options.get('is_timegate', False)):
|
||||
|
||||
embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod)
|
||||
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
url = wbrequest.wb_url.url
|
||||
ctype='text/html'
|
||||
ctype = 'text/html'
|
||||
|
||||
return self.frame_insert_view.render_response(embed_url=embed_url,
|
||||
wbrequest=wbrequest,
|
||||
@ -88,7 +89,9 @@ class RewriteLiveView(BaseContentView):
|
||||
def __init__(self, config):
|
||||
super(RewriteLiveView, self).__init__(config)
|
||||
|
||||
self.rewriter = LiveRewriter(defmod=self._mp_mod)
|
||||
default_proxy = config.get('proxyhostport')
|
||||
self.rewriter = LiveRewriter(defmod=self._mp_mod,
|
||||
default_proxy=default_proxy)
|
||||
|
||||
def render_content(self, wbrequest, *args):
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||
@ -257,12 +260,10 @@ class ReplayView(BaseContentView):
|
||||
return content
|
||||
|
||||
def _redirect_if_needed(self, wbrequest, cdx):
|
||||
if wbrequest.is_proxy:
|
||||
if wbrequest.options['is_proxy']:
|
||||
return None
|
||||
|
||||
# todo: generalize this?
|
||||
redir_needed = (hasattr(wbrequest, 'is_timegate') and
|
||||
wbrequest.is_timegate)
|
||||
redir_needed = (wbrequest.options.get('is_timegate', False))
|
||||
|
||||
if not redir_needed and self.redir_to_exact:
|
||||
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
|
||||
|
@ -2,8 +2,6 @@ from pywb.utils.timeutils import timestamp_to_datetime
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import make_timemap, LINK_FORMAT
|
||||
|
||||
from handlers import WBHandler
|
||||
|
||||
import urlparse
|
||||
import logging
|
||||
|
||||
@ -62,7 +60,8 @@ def is_wb_handler(obj):
|
||||
if not hasattr(obj, 'handler'):
|
||||
return False
|
||||
|
||||
return isinstance(obj.handler, WBHandler)
|
||||
#return isinstance(obj.handler, WBHandler)
|
||||
return obj.handler.__class__.__name__ == "WBHandler"
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
1
sample_archive/text_content/toptest.js
Normal file
1
sample_archive/text_content/toptest.js
Normal file
@ -0,0 +1 @@
|
||||
!function(){top!==window&&(alert("For security reasons, framing is not allowed."),top.location.replace(document.location))}
|
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ class PyTest(TestCommand):
|
||||
|
||||
setup(
|
||||
name='pywb',
|
||||
version='0.4.7',
|
||||
version='0.5.0',
|
||||
url='https://github.com/ikreymer/pywb',
|
||||
author='Ilya Kreymer',
|
||||
author_email='ikreymer@gmail.com',
|
||||
|
@ -17,6 +17,10 @@ collections:
|
||||
index_paths: './sample_archive/cdx/'
|
||||
filters: ['filename:dupe*']
|
||||
|
||||
pywb-filt-2:
|
||||
index_paths: './sample_archive/cdx/'
|
||||
filters: ['!filename:dupe*']
|
||||
|
||||
pywb-nonframe:
|
||||
index_paths: './sample_archive/cdx/'
|
||||
framed_replay: false
|
||||
@ -26,6 +30,14 @@ collections:
|
||||
index_paths: './sample_archive/non-surt-cdx/'
|
||||
surt_ordered: false
|
||||
|
||||
# live collection
|
||||
live: $liveweb
|
||||
|
||||
# coll with fallback
|
||||
pywb-fallback:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
fallback: live
|
||||
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
@ -94,6 +106,12 @@ static_routes:
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
||||
# Additional proxy options (defaults)
|
||||
proxy_options:
|
||||
use_default_coll: true
|
||||
|
||||
unaltered_replay: false
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
#enable_cdx_api: True
|
||||
# or specify suffix
|
||||
|
@ -14,7 +14,12 @@ enable_memento: true
|
||||
# Enable simple http proxy mode
|
||||
enable_http_proxy: true
|
||||
|
||||
# test unaltered replay for proxy as well
|
||||
proxy_options:
|
||||
unaltered_replay: true
|
||||
|
||||
# enable cdx server api for timemap
|
||||
enable_cdx_api: true
|
||||
|
||||
|
||||
# test memento with framed replay
|
||||
framed_replay: true
|
||||
|
@ -1,5 +1,6 @@
|
||||
from pytest import raises
|
||||
import webtest
|
||||
import base64
|
||||
from pywb.webapp.pywb_init import create_wb_router
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
@ -258,6 +259,14 @@ class TestWb:
|
||||
resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com?example=3', status = 503)
|
||||
assert resp.status_int == 503
|
||||
|
||||
def test_live_frame(self):
    """
    Request through the 'live' collection with the mp_ modifier
    """
    live_url = '/live/mp_/http://example.com/?test=test'
    resp = self.testapp.get(live_url)

    assert resp.status_int == 200
|
||||
|
||||
def test_live_fallback(self):
    """
    Request through 'pywb-fallback', which is configured with
    fallback: live
    """
    fallback_url = '/pywb-fallback/mp_/http://example.com/?test=test'
    resp = self.testapp.get(fallback_url)

    assert resp.status_int == 200
|
||||
|
||||
def test_post_1(self):
|
||||
resp = self.testapp.post('/pywb/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
|
||||
|
||||
@ -317,6 +326,50 @@ class TestWb:
|
||||
assert 'Sun, Jan 26 2014 20:11:27' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
|
||||
def test_proxy_replay_auth_filtered(self):
    """
    Proxy request with Basic proxy auth user 'pywb-filt-2'
    """
    credentials = base64.b64encode('pywb-filt-2:')
    headers = [('Proxy-Authorization', 'Basic ' + credentials)]
    environ = dict(REQUEST_URI='http://www.iana.org/', SCRIPT_NAME='')

    resp = self.testapp.get('/x-ignore-this-x',
                            headers=headers,
                            extra_environ=environ)

    self._assert_basic_html(resp)

    body = resp.body
    assert 'Sun, Jan 26 2014 20:06:24' in body
    assert 'wb.js' in body
|
||||
|
||||
def test_proxy_replay_auth(self):
    """
    Proxy request with Basic proxy auth credentials 'pywb'
    """
    credentials = base64.b64encode('pywb')
    headers = [('Proxy-Authorization', 'Basic ' + credentials)]
    environ = dict(REQUEST_URI='http://www.iana.org/', SCRIPT_NAME='')

    resp = self.testapp.get('/x-ignore-this-x',
                            headers=headers,
                            extra_environ=environ)

    self._assert_basic_html(resp)

    body = resp.body
    assert 'Mon, Jan 27 2014 17:12:38' in body
    assert 'wb.js' in body
|
||||
|
||||
def test_proxy_replay_auth_no_coll(self):
    """
    Proxy auth credentials 'no-such-coll' are answered with a 407
    """
    credentials = base64.b64encode('no-such-coll')
    headers = [('Proxy-Authorization', 'Basic ' + credentials)]
    environ = dict(REQUEST_URI='http://www.iana.org/', SCRIPT_NAME='')

    resp = self.testapp.get('/x-ignore-this-x',
                            headers=headers,
                            extra_environ=environ,
                            status=407)

    assert resp.status_int == 407
|
||||
|
||||
def test_proxy_replay_auth_invalid_1(self):
    """
    Proxy-Authorization header prefixed with 'abc' instead of
    'Basic ' is answered with a 407
    """
    credentials = base64.b64encode('no-such-coll')
    headers = [('Proxy-Authorization', 'abc' + credentials)]
    environ = dict(REQUEST_URI='http://www.iana.org/', SCRIPT_NAME='')

    resp = self.testapp.get('/x-ignore-this-x',
                            headers=headers,
                            extra_environ=environ,
                            status=407)

    assert resp.status_int == 407
|
||||
|
||||
def test_proxy_replay_auth_invalid_2(self):
    """
    Proxy-Authorization header of just 'basic', with no credentials,
    is answered with a 407
    """
    headers = [('Proxy-Authorization', 'basic')]
    environ = dict(REQUEST_URI='http://www.iana.org/', SCRIPT_NAME='')

    resp = self.testapp.get('/x-ignore-this-x',
                            headers=headers,
                            extra_environ=environ,
                            status=407)

    assert resp.status_int == 407
|
||||
|
||||
def test_proxy_pac(self):
|
||||
resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
|
||||
assert resp.content_type == 'application/x-ns-proxy-autoconfig'
|
||||
|
@ -4,7 +4,8 @@ import webtest
|
||||
|
||||
class TestLiveRewriter:
|
||||
def setup(self):
|
||||
self.app = init_app(create_live_rewriter_app, load_yaml=False)
|
||||
self.app = init_app(create_live_rewriter_app, load_yaml=False,
|
||||
config=dict(framed_replay=True))
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def test_live_rewrite_1(self):
|
||||
@ -22,4 +23,12 @@ class TestLiveRewriter:
|
||||
assert '<iframe ' in resp.body
|
||||
assert 'src="/rewrite/mp_/http://example.com/"' in resp.body
|
||||
|
||||
def test_live_invalid(self):
    """
    Live rewrite of http://abcdef is answered with a 400
    """
    bad_url = '/rewrite/mp_/http://abcdef'
    resp = self.testapp.get(bad_url, status=400)

    assert resp.status_int == 400
|
||||
|
||||
def test_live_invalid_2(self):
    """
    Live rewrite of the non-url '@#$@#$' is answered with a 400
    """
    bad_url = '/rewrite/mp_/@#$@#$'
    resp = self.testapp.get(bad_url, status=400)

    assert resp.status_int == 400
|
||||
|
||||
|
||||
|
@ -34,7 +34,7 @@ class TestWb:
|
||||
"""
|
||||
TimeGate with no Accept-Datetime header
|
||||
"""
|
||||
resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')
|
||||
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/screen.css')
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
@ -46,7 +46,7 @@ class TestWb:
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
assert '/pywb/20140127171239mp_/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
|
||||
|
||||
def test_timegate_accept_datetime(self):
|
||||
@ -54,7 +54,7 @@ class TestWb:
|
||||
TimeGate with Accept-Datetime header
|
||||
"""
|
||||
headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}
|
||||
resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css', headers=headers)
|
||||
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/screen.css', headers=headers)
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
@ -67,7 +67,7 @@ class TestWb:
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
assert '/pywb/20140126200804mp_/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
|
||||
|
||||
def test_non_timegate_intermediate_redir(self):
|
||||
@ -76,7 +76,7 @@ class TestWb:
|
||||
"""
|
||||
headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'}
|
||||
# not a timegate, partial timestamp /2014/ present
|
||||
resp = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css', headers=headers)
|
||||
resp = self.testapp.get('/pywb/2014mp_/http://www.iana.org/_css/2013.1/screen.css', headers=headers)
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
@ -90,14 +90,14 @@ class TestWb:
|
||||
|
||||
|
||||
# redirect to latest, not negotiation via Accept-Datetime
|
||||
assert '/pywb/20140127171239/' in resp.headers['Location']
|
||||
assert '/pywb/20140127171239mp_/' in resp.headers['Location']
|
||||
|
||||
|
||||
def test_memento_url(self):
|
||||
"""
|
||||
Memento response, 200 capture
|
||||
"""
|
||||
resp = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css')
|
||||
resp = self.testapp.get('/pywb/20140126200804mp_/http://www.iana.org/_css/2013.1/screen.css')
|
||||
|
||||
assert resp.status_int == 200
|
||||
|
||||
@ -105,7 +105,7 @@ class TestWb:
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"' in links
|
||||
assert '<http://localhost:80/pywb/mp_/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'
|
||||
@ -115,7 +115,7 @@ class TestWb:
|
||||
"""
|
||||
Memento (capture) of a 302 response
|
||||
"""
|
||||
resp = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example')
|
||||
resp = self.testapp.get('/pywb/20140128051539mp_/http://www.iana.org/domains/example')
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
@ -123,7 +123,7 @@ class TestWb:
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/domains/example>; rel="original"' in links
|
||||
assert '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"' in links
|
||||
assert '<http://localhost:80/pywb/mp_/http://www.iana.org/domains/example>; rel="timegate"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/domains/example') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'
|
||||
@ -147,12 +147,12 @@ rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT"
|
||||
|
||||
assert lines[1] == '<http://example.com?example=1>; rel="original",'
|
||||
|
||||
assert lines[2] == '<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",'
|
||||
assert lines[2] == '<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",'
|
||||
|
||||
assert lines[3] == '<http://localhost:80/pywb/20140103030321/http://example.com?example=1>; \
|
||||
assert lines[3] == '<http://localhost:80/pywb/20140103030321mp_/http://example.com?example=1>; \
|
||||
rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT",'
|
||||
|
||||
assert lines[4] == '<http://localhost:80/pywb/20140103030341/http://example.com?example=1>; \
|
||||
assert lines[4] == '<http://localhost:80/pywb/20140103030341mp_/http://example.com?example=1>; \
|
||||
rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"'
|
||||
|
||||
def test_timemap_2(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user