1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

referer redirect: check against registered routes

js rewriter: only rewrite quoted strings, support relative redirect
Jinja view: add 'host' filter for extracting hostname
css tweak
This commit is contained in:
Ilya Kreymer 2014-02-09 01:42:42 -08:00
parent a757f53bd5
commit 232ac733ab
4 changed files with 84 additions and 39 deletions

View File

@ -28,10 +28,7 @@ class ArchivalRouter:
if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
return self.render_home_page()
if not self.fallback:
return None
return self.fallback(WbRequest.from_uri(None, env))
return self.fallback(env, self.routes) if self.fallback else None
def render_home_page(self):
@ -76,7 +73,13 @@ class Route:
def __call__(self, env, use_abs_prefix):
request_uri = env['REL_REQUEST_URI']
wbrequest = self.parse_request(env, use_abs_prefix)
return self.handler(wbrequest) if wbrequest else None
def parse_request(self, env, use_abs_prefix, request_uri = None):
if not request_uri:
request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:])
if not matcher:
return None
@ -104,7 +107,8 @@ class Route:
# Allow additional filters to be applied
self._apply_filters(wbrequest, matcher)
return self._handle_request(wbrequest)
return wbrequest
def _apply_filters(self, wbrequest, matcher):
for filter in self.filters:
@ -114,9 +118,6 @@ class Route:
def _custom_init(self, config):
self.filters = config.get('filters', [])
def _handle_request(self, wbrequest):
return self.handler(wbrequest)
def __str__(self):
#return '* ' + self.regex_str + ' => ' + str(self.handler)
return str(self.handler)
@ -143,6 +144,10 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/path/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
@ -151,6 +156,7 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
@ -175,30 +181,48 @@ class ReferRedirect:
self.match_prefixs = [match_prefixs]
def __call__(self, wbrequest):
if wbrequest.referrer is None:
def __call__(self, env, routes):
referrer = env.get('HTTP_REFERER')
# ensure there is a referrer
if referrer is None:
return None
if not any (wbrequest.referrer.startswith(i) for i in self.match_prefixs):
# ensure referrer starts with one of allowed hosts
if not any (referrer.startswith(i) for i in self.match_prefixs):
return None
ref_split = urlparse.urlsplit(wbrequest.referrer)
# get referrer path name
ref_split = urlparse.urlsplit(referrer)
path = ref_split.path
script_name = wbrequest.env['SCRIPT_NAME']
if not path.startswith(script_name):
app_path = env['SCRIPT_NAME']
if app_path:
# must start with current app name, if not root
if not path.startswith(app_path):
return None
path = path[len(app_path):]
for route in routes:
ref_request = route.parse_request(env, False, request_uri = path)
if ref_request:
break
# must have matched one of the routes
if not ref_request:
return None
ref_path = path[len(script_name) + 1:].split('/', 1)
# No match on any exception
try:
rewriter = UrlRewriter(ref_path[1], script_name + '/' + ref_path[0] + '/')
except Exception:
# must have a rewriter
if not ref_request.urlrewriter:
return None
rel_request_uri = wbrequest.request_uri[1:]
rewriter = ref_request.urlrewriter
rel_request_uri = env['REL_REQUEST_URI'][1:]
timestamp_path = rewriter.wburl.timestamp + '/'
@ -218,12 +242,13 @@ if __name__ == "__main__" or utils.enable_doctests():
import handlers
def test_redir(match_host, request_uri, referrer, script_name = ''):
def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll'):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
routes = [Route(coll, handlers.BaseHandler())]
redir = ReferRedirect(match_host)
req = WbRequest.from_uri(request_uri, env)
rep = redir(req)
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, routes)
if not rep:
return False

View File

@ -34,6 +34,8 @@ class RegexRewriter:
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
@ -101,6 +103,12 @@ class JSRewriter(RegexRewriter):
>>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
>>> test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'
@ -120,8 +128,14 @@ class JSRewriter(RegexRewriter):
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
rules.extend(extra)
@ -131,7 +145,7 @@ class JSRewriter(RegexRewriter):
def _create_rules(self, http_prefix):
return [
(r'(?<!/)\b' + RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
(self.JS_HTTPX, http_prefix, 0),
(r'(?<!/)\blocation\b', 'WB_wombat_', 0),
(r'(?<=document\.)domain', 'WB_wombat_', 0),
]

View File

@ -1,18 +1,19 @@
#_wayback_banner
{
display: block !important;
top: 0px !important;
left: 0px !important;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important;
position: inherit !important;
padding: 4px !important;
width: 100% !important;
font-size: 24px !important;
border: 1px solid !important;
background-color: lightYellow !important;
color: black !important;
text-align: center !important;
z-index: 2147483643 !important;
display: block !important;
top: 0px !important;
left: 0px !important;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important;
position: absolute !important;
padding: 4px !important;
width: 100% !important;
font-size: 24px !important;
border: 1px solid !important;
background-color: lightYellow !important;
color: black !important;
text-align: center !important;
z-index: 2147483643 !important;
line-height: normal !important;
}

View File

@ -3,6 +3,7 @@ import utils
import wbrequestresponse
import wbexceptions
import time
import urlparse
from os import path
from itertools import imap
@ -38,6 +39,7 @@ class J2TemplateView:
jinja_env = Environment(loader = loader, trim_blocks = True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
jinja_env.filters['host'] = J2TemplateView.get_host
return jinja_env
def render_to_string(self, **kwargs):
@ -59,6 +61,9 @@ class J2TemplateView:
value = utils.timestamp_to_datetime(value)
return time.strftime(format, value)
@staticmethod
def get_host(url):
return urlparse.urlsplit(url).netloc