diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index f090a6ae..493ca0c2 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -25,6 +25,16 @@ >>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} +# Referrer extraction +>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url +'http://blah.example.com/' + +# incorrect referer +>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str() + + +# no referer +>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str() # WbResponse Tests diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 11fd99db..80156aff 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -105,6 +105,16 @@ class WbRequest(object): def _parse_extra(self): pass + def extract_referrer_wburl_str(self): + if not self.referrer: + return None + + if not self.referrer.startswith(self.host_prefix + self.rel_prefix): + return None + + wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):] + return wburl_str + #================================================================= class WbResponse(object): diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index d3746198..61113114 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -17,7 +17,12 @@ from pywb.rewrite.rewrite_content import RewriteContent #================================================================= class LiveRewriter(object): - PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')] + PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'), + ('HTTP_ACCEPT', 'Accept'), + ('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'), + ('HTTP_ACCEPT_CHARSET', 'Accept-Charset'), + ('HTTP_REFERER', 'Referer'), + ] def __init__(self, defmod=''): self.rewriter = RewriteContent(defmod=defmod) diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 955e24df..bcad948e 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -60,13 +60,14 @@ # Error Urls # ====================== ->>> x = WbUrl('/#$%#/') +# no longer rejecting this here +#>>> x = WbUrl('/#$%#/') Traceback (most recent call last): Exception: Bad Request Url: http://#$%#/ ->>> x = WbUrl('/http://example.com:abc/') -Traceback (most recent call last): -Exception: Bad Request Url: http://example.com:abc/ +#>>> x = WbUrl('/http://example.com:abc/') +#Traceback (most recent call last): +#Exception: Bad Request Url: http://example.com:abc/ >>> x = WbUrl('') Traceback (most recent call last): diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 982743ae..c2ac9b23 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -39,7 +39,6 @@ wayback url format. """ import re -import rfc3987 #================================================================= @@ -104,14 +103,6 @@ class WbUrl(BaseWbUrl): if inx < len(self.url) and self.url[inx] != '/': self.url = self.url[:inx] + '/' + self.url[inx:] - # BUG?: adding upper() because rfc3987 lib - # rejects lower case %-encoding - # %2F is fine, but %2f -- standard supports either - matcher = rfc3987.match(self.url.upper(), 'IRI') - - if not matcher: - raise Exception('Bad Request Url: ' + self.url) - # Match query regex # ====================== def _init_query(self, url): diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index d554c010..46392dc1 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.rewrite.rewrite_live import LiveRewriter +from pywb.rewrite.wburl import WbUrl from handlers import StaticHandler @@ -22,16 +23,22 @@ class RewriteHandler(WbUrlHandler): #use_lxml_parser() self.rewriter = LiveRewriter(defmod='mp_') - head_insert = config.get('head_insert_html', - 'ui/head_insert.html') + view = config.get('head_insert_view') + if not view: + head_insert = config.get('head_insert_html', + 'ui/head_insert.html') + view = HeadInsertView.create_template(head_insert, 'Head Insert') - frame_insert = config.get('frame_insert_html', - 'ui/frame_insert.html') - - view = HeadInsertView.create_template(head_insert, 'Head Insert') self.head_insert_view = view - view = J2TemplateView.create_template(frame_insert, 'Frame Insert') + + view = config.get('frame_insert_view') + if not view: + frame_insert = config.get('frame_insert_html', + 'ui/frame_insert.html') + + view = J2TemplateView.create_template(frame_insert, 'Frame Insert') + self.frame_insert_view = view def __call__(self, wbrequest): @@ -49,6 +56,11 @@ class RewriteHandler(WbUrlHandler): head_insert_func = self.head_insert_view.create_insert_func(wbrequest) + ref_wburl_str = wbrequest.extract_referrer_wburl_str() + if ref_wburl_str: + wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url + + result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, head_insert_func=head_insert_func, env=wbrequest.env) diff --git a/setup.py b/setup.py index 329537c0..91279b4f 100755 --- a/setup.py +++ b/setup.py @@ -64,7 +64,6 @@ setup( glob.glob('sample_archive/text_content/*')), ], install_requires=[ - 'rfc3987', 'chardet', 'requests', 'redis', diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index 2cc9e108..b2a6dada 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -8,7 +8,7 @@ class TestLiveRewriter: self.testapp = webtest.TestApp(self.app) def test_live_rewrite_1(self): - headers = [('User-Agent', 'python')] + headers = [('User-Agent', 'python'), ('Referer', 'http://localhost:80/rewrite/other.example.com')] resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers) assert resp.status_int == 200