1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

referer redirect: check against registered routes

js rewriter: only rewrite quoted strings, support relative redirect
Jinja view: add 'host' filter for extracting hostname
css tweak
This commit is contained in:
Ilya Kreymer 2014-02-09 01:42:42 -08:00
parent a757f53bd5
commit 232ac733ab
4 changed files with 84 additions and 39 deletions

View File

@ -28,10 +28,7 @@ class ArchivalRouter:
if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']: if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
return self.render_home_page() return self.render_home_page()
if not self.fallback: return self.fallback(env, self.routes) if self.fallback else None
return None
return self.fallback(WbRequest.from_uri(None, env))
def render_home_page(self): def render_home_page(self):
@ -76,7 +73,13 @@ class Route:
def __call__(self, env, use_abs_prefix): def __call__(self, env, use_abs_prefix):
request_uri = env['REL_REQUEST_URI'] wbrequest = self.parse_request(env, use_abs_prefix)
return self.handler(wbrequest) if wbrequest else None
def parse_request(self, env, use_abs_prefix, request_uri = None):
if not request_uri:
request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:]) matcher = self.regex.match(request_uri[1:])
if not matcher: if not matcher:
return None return None
@ -104,7 +107,8 @@ class Route:
# Allow for applying of additional filters # Allow for applying of additional filters
self._apply_filters(wbrequest, matcher) self._apply_filters(wbrequest, matcher)
return self._handle_request(wbrequest) return wbrequest
def _apply_filters(self, wbrequest, matcher): def _apply_filters(self, wbrequest, matcher):
for filter in self.filters: for filter in self.filters:
@ -114,9 +118,6 @@ class Route:
def _custom_init(self, config): def _custom_init(self, config):
self.filters = config.get('filters', []) self.filters = config.get('filters', [])
def _handle_request(self, wbrequest):
return self.handler(wbrequest)
def __str__(self): def __str__(self):
#return '* ' + self.regex_str + ' => ' + str(self.handler) #return '* ' + self.regex_str + ' => ' + str(self.handler)
return str(self.handler) return str(self.handler)
@ -143,6 +144,10 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html' 'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/path/other.html'
# With timestamp included # With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html' 'http://localhost:8080/coll/20131010/http://example.com/other.html'
@ -151,6 +156,7 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html') >>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html' 'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False False
@ -175,30 +181,48 @@ class ReferRedirect:
self.match_prefixs = [match_prefixs] self.match_prefixs = [match_prefixs]
def __call__(self, wbrequest): def __call__(self, env, routes):
if wbrequest.referrer is None: referrer = env.get('HTTP_REFERER')
# ensure there is a referrer
if referrer is None:
return None return None
if not any (wbrequest.referrer.startswith(i) for i in self.match_prefixs): # ensure referrer starts with one of allowed hosts
if not any (referrer.startswith(i) for i in self.match_prefixs):
return None return None
ref_split = urlparse.urlsplit(wbrequest.referrer) # get referrer path name
ref_split = urlparse.urlsplit(referrer)
path = ref_split.path path = ref_split.path
script_name = wbrequest.env['SCRIPT_NAME']
if not path.startswith(script_name): app_path = env['SCRIPT_NAME']
if app_path:
# must start with current app name, if not root
if not path.startswith(app_path):
return None
path = path[len(app_path):]
for route in routes:
ref_request = route.parse_request(env, False, request_uri = path)
if ref_request:
break
# must have matched one of the routes
if not ref_request:
return None return None
ref_path = path[len(script_name) + 1:].split('/', 1) # must have a rewriter
if not ref_request.urlrewriter:
# No match on any exception
try:
rewriter = UrlRewriter(ref_path[1], script_name + '/' + ref_path[0] + '/')
except Exception:
return None return None
rel_request_uri = wbrequest.request_uri[1:] rewriter = ref_request.urlrewriter
rel_request_uri = env['REL_REQUEST_URI'][1:]
timestamp_path = rewriter.wburl.timestamp + '/' timestamp_path = rewriter.wburl.timestamp + '/'
@ -218,12 +242,13 @@ if __name__ == "__main__" or utils.enable_doctests():
import handlers import handlers
def test_redir(match_host, request_uri, referrer, script_name = ''): def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll'):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name} env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
routes = [Route(coll, handlers.BaseHandler())]
redir = ReferRedirect(match_host) redir = ReferRedirect(match_host)
req = WbRequest.from_uri(request_uri, env) #req = WbRequest.from_uri(request_uri, env)
rep = redir(req) rep = redir(env, routes)
if not rep: if not rep:
return False return False

View File

@ -34,6 +34,8 @@ class RegexRewriter:
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix DEFAULT_OP = add_prefix
@ -101,6 +103,12 @@ class JSRewriter(RegexRewriter):
>>> test_js(r'location = "http:\\/\\/example.com/abc.html"') >>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"' 'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
>>> test_js(r'location = /http:\/\/example.com/abc.html/') >>> test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/' 'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'
@ -120,8 +128,14 @@ class JSRewriter(RegexRewriter):
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
""" """
JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, extra = []): def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url()) rules = self._create_rules(rewriter.get_abs_url())
rules.extend(extra) rules.extend(extra)
@ -131,7 +145,7 @@ class JSRewriter(RegexRewriter):
def _create_rules(self, http_prefix): def _create_rules(self, http_prefix):
return [ return [
(r'(?<!/)\b' + RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0), (self.JS_HTTPX, http_prefix, 0),
(r'(?<!/)\blocation\b', 'WB_wombat_', 0), (r'(?<!/)\blocation\b', 'WB_wombat_', 0),
(r'(?<=document\.)domain', 'WB_wombat_', 0), (r'(?<=document\.)domain', 'WB_wombat_', 0),
] ]

View File

@ -1,18 +1,19 @@
#_wayback_banner #_wayback_banner
{ {
display: block !important; display: block !important;
top: 0px !important; top: 0px !important;
left: 0px !important; left: 0px !important;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important;
position: inherit !important; position: absolute !important;
padding: 4px !important; padding: 4px !important;
width: 100% !important; width: 100% !important;
font-size: 24px !important; font-size: 24px !important;
border: 1px solid !important; border: 1px solid !important;
background-color: lightYellow !important; background-color: lightYellow !important;
color: black !important; color: black !important;
text-align: center !important; text-align: center !important;
z-index: 2147483643 !important; z-index: 2147483643 !important;
line-height: normal !important;
} }

View File

@ -3,6 +3,7 @@ import utils
import wbrequestresponse import wbrequestresponse
import wbexceptions import wbexceptions
import time import time
import urlparse
from os import path from os import path
from itertools import imap from itertools import imap
@ -38,6 +39,7 @@ class J2TemplateView:
jinja_env = Environment(loader = loader, trim_blocks = True) jinja_env = Environment(loader = loader, trim_blocks = True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts jinja_env.filters['format_ts'] = J2TemplateView.format_ts
jinja_env.filters['host'] = J2TemplateView.get_host
return jinja_env return jinja_env
def render_to_string(self, **kwargs): def render_to_string(self, **kwargs):
@ -59,6 +61,9 @@ class J2TemplateView:
value = utils.timestamp_to_datetime(value) value = utils.timestamp_to_datetime(value)
return time.strftime(format, value) return time.strftime(format, value)
@staticmethod
def get_host(url):
    """Return the network-location (``netloc``) component of *url*.

    Registered as the ``host`` Jinja filter. The value is whatever
    ``urlsplit`` reports as ``netloc``, so a port or userinfo present in
    the URL is kept as-is.
    """
    split_result = urlparse.urlsplit(url)
    return split_result.netloc