mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
- add referrer self-redirect check and test case
- dispatching: cleanup wbrequestresponse, move tests to a separate file - wbrequest: store both rel_prefix and host_prefix, with wb_prefix either full or rel path as needed, so that full and relative paths are both available in wbrequest - create WbUrlHandler to differentiate handlers which support WbUrl (timestamp[mod]/url) semantics vs other request handlers.
This commit is contained in:
parent
a4f1224d16
commit
9194e867ea
@ -3,13 +3,13 @@ import re
|
||||
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
# ArchivalRouter -- route WB requests in archival mode
|
||||
#=================================================================
|
||||
class ArchivalRouter:
|
||||
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
|
||||
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
|
||||
self.routes = routes
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
self.abs_path = abs_path
|
||||
@ -69,24 +69,25 @@ class Route:
|
||||
if not matcher:
|
||||
return None
|
||||
|
||||
rel_prefix = matcher.group(0)
|
||||
matched_str = matcher.group(0)
|
||||
|
||||
if rel_prefix:
|
||||
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
|
||||
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri
|
||||
if matched_str:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
|
||||
else:
|
||||
wb_prefix = env['SCRIPT_NAME'] + '/'
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/'
|
||||
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
|
||||
|
||||
coll = matcher.group(self.coll_group)
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
request_uri = request_uri,
|
||||
wb_url_str = wb_url_str,
|
||||
wb_prefix = wb_prefix,
|
||||
coll = coll,
|
||||
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
|
||||
wburl_class = self.handler.get_wburl_type())
|
||||
request_uri=request_uri,
|
||||
wb_url_str=wb_url_str,
|
||||
rel_prefix=rel_prefix,
|
||||
coll=coll,
|
||||
use_abs_prefix=use_abs_prefix,
|
||||
wburl_class = self.handler.get_wburl_type(),
|
||||
urlrewriter_class=UrlRewriter)
|
||||
|
||||
|
||||
# Allow for applying of additional filters
|
||||
|
@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
# NOTE: external dependency -- need self-contained test
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
#>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
('original', 'http://example.com:80/'),
|
||||
|
@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
|
||||
from views import TextCapturesView
|
||||
|
||||
|
||||
class BaseHandler:
|
||||
@staticmethod
|
||||
def get_wburl_type():
|
||||
return WbUrl
|
||||
|
||||
#=================================================================
|
||||
class BaseHandler(object):
    """Minimal request handler.

    Calling an instance hands back the request it was given, and the
    handler advertises no wburl class.
    """

    def get_wburl_type(self):
        """Return the wburl class this handler works with.

        The base handler has none, so this is always ``None``.
        """
        return None

    def __call__(self, wbrequest):
        """Handle ``wbrequest`` by returning it unchanged."""
        return wbrequest
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrlHandler(BaseHandler):
    """Base class for handlers that support WbUrl-style requests.

    Differs from ``BaseHandler`` only in that it reports ``WbUrl`` as its
    wburl class instead of ``None``.
    """

    def get_wburl_type(self):
        """Return the ``WbUrl`` class."""
        return WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
class WBHandler(BaseHandler):
|
||||
def __init__(self, index_reader, replay, html_view = None, search_view = None):
|
||||
class WBHandler(WbUrlHandler):
|
||||
def __init__(self, index_reader, replay,
|
||||
html_view=None, search_view=None):
|
||||
|
||||
self.index_reader = index_reader
|
||||
self.replay = replay
|
||||
|
||||
@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
|
||||
self.html_view = html_view
|
||||
self.search_view = search_view
|
||||
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
if wbrequest.wb_url_str == '/':
|
||||
return self.render_search_page(wbrequest)
|
||||
@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
|
||||
def __str__(self):
|
||||
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# CDX-Server Handler -- pass all params to cdx server
|
||||
#=================================================================
|
||||
@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_wburl_type():
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
return 'Index Reader: ' + str(self.index_reader)
|
||||
|
||||
@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
|
||||
except IOError:
|
||||
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
|
||||
|
||||
@staticmethod
|
||||
def get_wburl_type():
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
return 'Static files from ' + self.static_path
|
||||
|
||||
@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
|
||||
def __call__(self, wbrequest):
|
||||
return WbResponse.text_response(str(wbrequest.env))
|
||||
|
||||
|
||||
#=================================================================
|
||||
class DebugEchoHandler(BaseHandler):
|
||||
def __call__(self, wbrequest):
|
||||
@ -150,5 +151,3 @@ class PerfTimer:
|
||||
self.end = time.clock()
|
||||
if self.perfdict is not None:
|
||||
self.perfdict[self.name] = str(self.end - self.start)
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ class IndexReader(object):
|
||||
def load_cdx(self, **params):
|
||||
return self.cdx_server.load_cdx(**params)
|
||||
|
||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
|
||||
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
|
||||
if wburl.type == wburl.URL_QUERY:
|
||||
raise NotImplementedError('Url Query Not Yet Supported')
|
||||
|
||||
|
@ -45,14 +45,14 @@ class ProxyRouter:
|
||||
return None
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
request_uri = url,
|
||||
wb_url_str = url,
|
||||
wb_prefix = '',
|
||||
coll = '',
|
||||
host_prefix = self.hostpaths[0],
|
||||
wburl_class = self.handler.get_wburl_type(),
|
||||
url_rewriter_class = ProxyHttpsUrlRewriter,
|
||||
is_proxy = True)
|
||||
request_uri=url,
|
||||
wb_url_str=url,
|
||||
#rel_prefix=url,
|
||||
#host_prefix=self.hostpaths[0],
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
urlrewriter_class=ProxyHttpsUrlRewriter,
|
||||
use_abs_prefix=False,
|
||||
is_proxy=True)
|
||||
|
||||
return self.handler(wbrequest)
|
||||
|
||||
|
@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
|
||||
from wbexceptions import CaptureException, InternalRedirect
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ReplayView:
|
||||
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
|
||||
@ -49,6 +48,9 @@ class ReplayView:
|
||||
# check if redir is needed
|
||||
self._redirect_if_needed(wbrequest, cdx)
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest, status_headers)
|
||||
|
||||
response = None
|
||||
|
||||
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
|
||||
@ -148,6 +150,7 @@ class ReplayView:
|
||||
|
||||
|
||||
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
||||
# self-redirect via location
|
||||
if status_headers.statusline.startswith('3'):
|
||||
request_url = wbrequest.wb_url.url.lower()
|
||||
location_url = status_headers.get_header('Location').lower()
|
||||
@ -156,3 +159,16 @@ class ReplayView:
|
||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||
raise CaptureException('Self Redirect: ' + str(cdx))
|
||||
|
||||
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
|
||||
# at correct timestamp now, but must check for referrer redirect
|
||||
# indirect self-redirect, via meta-refresh, if referrer is same as current url
|
||||
if status_headers.statusline.startswith('2'):
|
||||
# build full url even if using relative-rewriting
|
||||
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
|
||||
referrer_url = wbrequest.referrer
|
||||
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -24,9 +24,9 @@ def test_example_2():
|
||||
|
||||
|
||||
|
||||
def test_example_3():
|
||||
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
|
||||
#def test_example_3():
|
||||
# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
|
||||
|
||||
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
|
||||
# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
|
||||
|
||||
|
||||
|
@ -103,10 +103,12 @@ class UrlRewriter:
|
||||
|
||||
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
|
||||
|
||||
|
||||
def set_base_url(self, newUrl):
|
||||
self.wburl.url = newUrl
|
||||
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
|
||||
@staticmethod
|
||||
def strip_protocol(url):
|
||||
for protocol in UrlRewriter.PROTOCOLS:
|
||||
|
@ -1,13 +1,19 @@
|
||||
"""
|
||||
Test Route
|
||||
# route with relative path
|
||||
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
|
||||
# Test WbRequest parsed via a Route
|
||||
# route with relative path, print resulting wbrequest
|
||||
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
|
||||
{'coll': 'web',
|
||||
'request_uri': '/web/test.example.com',
|
||||
'wb_prefix': '/web/',
|
||||
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
|
||||
|
||||
# route with absolute path, running at script /my_pywb
|
||||
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
|
||||
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
|
||||
|
||||
# route with absolute path, running at script /my_pywb, print resulting wbrequest
|
||||
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
|
||||
{'coll': 'web',
|
||||
'request_uri': '/web/2013im_/test.example.com',
|
||||
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
|
||||
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
|
||||
|
||||
# not matching route -- skipped
|
||||
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
|
||||
@ -65,7 +71,12 @@ False
|
||||
"""
|
||||
|
||||
from pywb.archivalrouter import Route, ReferRedirect
|
||||
from pywb.handlers import BaseHandler
|
||||
from pywb.handlers import BaseHandler, WbUrlHandler
|
||||
import pprint
|
||||
|
||||
def print_req(req):
    """Pretty-print the request_uri, wb_prefix, wb_url and coll of ``req``.

    Only these four attributes are shown, so that the doctest output does
    not depend on any other attribute of the request object.
    """
    shown = ('request_uri', 'wb_prefix', 'wb_url', 'coll')
    attrs = vars(req)
    summary = dict((name, attrs[name]) for name in shown)
    pprint.pprint(summary)
|
||||
|
||||
|
||||
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
|
||||
@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
|
||||
if http_host:
|
||||
env['HTTP_HOST'] = http_host
|
||||
|
||||
routes = [Route(coll, BaseHandler())]
|
||||
routes = [Route(coll, WbUrlHandler())]
|
||||
|
||||
redir = ReferRedirect(match_host)
|
||||
#req = WbRequest.from_uri(request_uri, env)
|
||||
@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
|
||||
return rep.status_headers.get_header('Location')
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
87
pywb/test/test_wbrequestresponse.py
Normal file
87
pywb/test/test_wbrequestresponse.py
Normal file
@ -0,0 +1,87 @@
|
||||
"""
|
||||
# WbRequest Tests
|
||||
# =================
|
||||
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
|
||||
|
||||
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
|
||||
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
|
||||
|
||||
>>> print_req_from_uri('/2010/example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
>>> print_req_from_uri('../example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
|
||||
|
||||
# Abs path
|
||||
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
# No Scheme, so stick to relative
|
||||
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
|
||||
|
||||
# WbResponse Tests
|
||||
# =================
|
||||
>>> WbResponse.text_response('Test')
|
||||
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
|
||||
|
||||
>>> WbResponse.text_stream(['Test', 'Another'], '404')
|
||||
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
|
||||
|
||||
>>> WbResponse.redir_response('http://example.com/otherfile')
|
||||
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.wbrequestresponse import WbRequest, WbResponse
|
||||
|
||||
|
||||
def print_req_from_uri(request_uri, env=None, use_abs_prefix=False):
    """Build a request via ``req_from_uri`` and print a summary of it.

    Prints the ``str()`` of a dict holding the request's request_uri,
    wb_prefix, wb_url and coll attributes.

    Args:
        request_uri: uri to parse, passed through to ``req_from_uri``.
        env: wsgi-style environ dict; a fresh empty dict is used when
            omitted, so no dict is shared between calls.
        use_abs_prefix: passed through to ``req_from_uri``.
    """
    if env is None:
        env = {}

    request = req_from_uri(request_uri, env, use_abs_prefix)
    varlist = vars(request)
    # single-argument parenthesised form prints identically on Python 2 and 3
    print(str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}))
|
||||
|
||||
|
||||
def req_from_uri(request_uri, env=None, use_abs_prefix=False):
    """Create a WbRequest by splitting ``request_uri`` into coll and wb url.

    The uri is split on the first two '/' characters. With three pieces
    (``/<coll>/<wb_url>``) the middle piece is the collection; otherwise
    there is no collection and the last piece is the wb url.

    Args:
        request_uri: uri to parse; when empty, ``env['REL_REQUEST_URI']``
            is used instead.
        env: wsgi-style environ dict. A fresh empty dict is created when
            omitted, because the dict is handed on to WbRequest and must
            not be shared between calls.
        use_abs_prefix: passed through to WbRequest.

    Returns:
        The constructed WbRequest.
    """
    if env is None:
        env = {}

    if not request_uri:
        request_uri = env.get('REL_REQUEST_URI')

    parts = request_uri.split('/', 2)

    if len(parts) == 3:
        # Has coll prefix
        coll = parts[1]
        rel_prefix = '/' + coll + '/'
        wb_url_str = parts[2]
    else:
        # No coll prefix: one or two pieces, the wb url is the last one
        coll = ''
        rel_prefix = '/'
        wb_url_str = parts[-1]

    return WbRequest(env,
                     request_uri=request_uri,
                     rel_prefix=rel_prefix,
                     wb_url_str=wb_url_str,
                     coll=coll,
                     wburl_class=WbUrl,
                     urlrewriter_class=UrlRewriter,
                     use_abs_prefix=use_abs_prefix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -1,99 +1,75 @@
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
#WB Request and Response
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbRequest:
|
||||
"""
|
||||
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
|
||||
Represents the main pywb request object.
|
||||
|
||||
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c')
|
||||
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
|
||||
Contains various info from wsgi env, add additional info
|
||||
about the request, such as coll, relative prefix,
|
||||
host prefix, absolute prefix.
|
||||
|
||||
>>> WbRequest.from_uri('/2010/example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
>>> WbRequest.from_uri('../example.com')
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
|
||||
|
||||
# Abs path
|
||||
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
# No Scheme, so stick to relative
|
||||
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
If a wburl and url rewriter classes are specified, the class
|
||||
also contains the url rewriter.
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def from_uri(request_uri, env = {}, use_abs_prefix = False):
|
||||
if not request_uri:
|
||||
request_uri = env.get('REL_REQUEST_URI')
|
||||
|
||||
parts = request_uri.split('/', 2)
|
||||
|
||||
# Has coll prefix
|
||||
if len(parts) == 3:
|
||||
wb_prefix = '/' + parts[1] + '/'
|
||||
wb_url_str = parts[2]
|
||||
coll = parts[1]
|
||||
# No Coll Prefix
|
||||
elif len(parts) == 2:
|
||||
wb_prefix = '/'
|
||||
wb_url_str = parts[1]
|
||||
coll = ''
|
||||
else:
|
||||
wb_prefix = '/'
|
||||
wb_url_str = parts[0]
|
||||
coll = ''
|
||||
|
||||
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
|
||||
|
||||
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def make_host_prefix(env):
|
||||
try:
|
||||
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
|
||||
host = env.get('HTTP_HOST')
|
||||
if not host:
|
||||
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
||||
|
||||
return env['wsgi.url_scheme'] + '://' + host
|
||||
except KeyError:
|
||||
return ''
|
||||
|
||||
|
||||
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
|
||||
host_prefix = '',
|
||||
wburl_class = WbUrl,
|
||||
url_rewriter_class = UrlRewriter,
|
||||
is_proxy = False):
|
||||
def __init__(self, env,
|
||||
request_uri=None,
|
||||
rel_prefix='',
|
||||
wb_url_str='/',
|
||||
coll='',
|
||||
host_prefix='',
|
||||
use_abs_prefix=False,
|
||||
wburl_class=None,
|
||||
urlrewriter_class=None,
|
||||
is_proxy=False):
|
||||
|
||||
self.env = env
|
||||
|
||||
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
|
||||
|
||||
self.host_prefix = host_prefix
|
||||
self.coll = coll
|
||||
|
||||
if not host_prefix:
|
||||
host_prefix = self.make_host_prefix(env)
|
||||
|
||||
self.host_prefix = host_prefix
|
||||
self.rel_prefix = rel_prefix
|
||||
|
||||
if use_abs_prefix:
|
||||
self.wb_prefix = host_prefix + rel_prefix
|
||||
else:
|
||||
self.wb_prefix = rel_prefix
|
||||
|
||||
self.wb_prefix = host_prefix + wb_prefix
|
||||
|
||||
if not wb_url_str:
|
||||
wb_url_str = '/'
|
||||
|
||||
self.wb_url_str = wb_url_str
|
||||
|
||||
# wb_url present and not root page
|
||||
if wb_url_str != '/' and wburl_class:
|
||||
self.wb_url_str = wb_url_str
|
||||
self.wb_url = wburl_class(wb_url_str)
|
||||
self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix)
|
||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
|
||||
else:
|
||||
# no wb_url, just store blank wb_url
|
||||
self.wb_url_str = wb_url_str
|
||||
self.wb_url = None
|
||||
self.urlrewriter = None
|
||||
|
||||
self.coll = coll
|
||||
|
||||
self.referrer = env.get('HTTP_REFERER')
|
||||
|
||||
self.is_ajax = self._is_ajax()
|
||||
@ -122,24 +98,19 @@ class WbRequest:
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
|
||||
#return str(vars(self))
|
||||
varlist = vars(self)
|
||||
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
|
||||
varstr = pprint.pformat(varlist)
|
||||
return varstr
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbResponse:
|
||||
"""
|
||||
>>> WbResponse.text_response('Test')
|
||||
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
|
||||
Represents a pywb wsgi response object.
|
||||
|
||||
>>> WbResponse.text_stream(['Test', 'Another'], '404')
|
||||
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
|
||||
|
||||
>>> WbResponse.redir_response('http://example.com/otherfile')
|
||||
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
|
||||
Holds a status_headers object and a response iter, to be
|
||||
returned to wsgi container.
|
||||
"""
|
||||
|
||||
def __init__(self, status_headers, value = []):
|
||||
self.status_headers = status_headers
|
||||
self.body = value
|
||||
@ -180,8 +151,3 @@ class WbResponse:
|
||||
|
||||
def __repr__(self):
|
||||
return str(vars(self))
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
@ -124,6 +124,20 @@ class TestWb:
|
||||
assert resp.content_type == 'text/css'
|
||||
|
||||
|
||||
def test_referrer_self_redirect(self):
    """A capture requested with itself as referrer is answered with a 302."""
    host = 'somehost:8082'
    uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
    referrer = 'http://' + host + uri

    # capture is normally a 200
    plain = self.testapp.get(uri)
    assert plain.status_int == 200

    # redirect causes skip of this capture, redirect to next
    request_headers = [('Referer', referrer), ('Host', host)]
    redirected = self.testapp.get(uri, headers=request_headers, status=302)
    assert redirected.status_int == 302
|
||||
|
||||
|
||||
def test_excluded_content(self):
|
||||
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
|
||||
assert resp.status_int == 403
|
||||
|
Loading…
x
Reference in New Issue
Block a user