1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Remove rfc3987 validation, which was rejecting valid URLs

Add extract_referrer_wburl_str() to extract the WbUrl string, if any,
from the referrer. Use it in live_rewrite_handler to override the
default referrer.
This commit is contained in:
Ilya Kreymer 2014-04-15 16:38:53 -07:00
parent 611b9093bd
commit 85593696fa
8 changed files with 51 additions and 23 deletions

View File

@ -25,6 +25,16 @@
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) >>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# Referrer extraction
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
'http://blah.example.com/'
# incorrect referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str()
# no referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str()
# WbResponse Tests # WbResponse Tests

View File

@ -105,6 +105,16 @@ class WbRequest(object):
def _parse_extra(self): def _parse_extra(self):
pass pass
def extract_referrer_wburl_str(self):
    """Return the part of the referrer that follows this request's prefix.

    The prefix is ``self.host_prefix + self.rel_prefix``. When the
    referrer starts with that prefix, everything after it is returned
    as a string; for example, with a prefix of
    ``http://localhost:8080/web/`` and a referrer of
    ``http://localhost:8080/web/2011/blah.example.com/`` the result is
    ``2011/blah.example.com/``.

    Returns ``None`` when there is no referrer, or when the referrer
    does not start with the prefix (e.g. it points at another host).
    """
    if not self.referrer:
        return None

    # Build the prefix once; it is needed for both the match and the slice.
    prefix = self.host_prefix + self.rel_prefix
    if not self.referrer.startswith(prefix):
        return None

    return self.referrer[len(prefix):]
#================================================================= #=================================================================
class WbResponse(object): class WbResponse(object):

View File

@ -17,7 +17,12 @@ from pywb.rewrite.rewrite_content import RewriteContent
#================================================================= #=================================================================
class LiveRewriter(object): class LiveRewriter(object):
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')] PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'),
('HTTP_ACCEPT', 'Accept'),
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
('HTTP_REFERER', 'Referer'),
]
def __init__(self, defmod=''): def __init__(self, defmod=''):
self.rewriter = RewriteContent(defmod=defmod) self.rewriter = RewriteContent(defmod=defmod)

View File

@ -60,13 +60,14 @@
# Error Urls # Error Urls
# ====================== # ======================
>>> x = WbUrl('/#$%#/') # no longer rejecting this here
#>>> x = WbUrl('/#$%#/')
Traceback (most recent call last): Traceback (most recent call last):
Exception: Bad Request Url: http://#$%#/ Exception: Bad Request Url: http://#$%#/
>>> x = WbUrl('/http://example.com:abc/') #>>> x = WbUrl('/http://example.com:abc/')
Traceback (most recent call last): #Traceback (most recent call last):
Exception: Bad Request Url: http://example.com:abc/ #Exception: Bad Request Url: http://example.com:abc/
>>> x = WbUrl('') >>> x = WbUrl('')
Traceback (most recent call last): Traceback (most recent call last):

View File

@ -39,7 +39,6 @@ wayback url format.
""" """
import re import re
import rfc3987
#================================================================= #=================================================================
@ -104,14 +103,6 @@ class WbUrl(BaseWbUrl):
if inx < len(self.url) and self.url[inx] != '/': if inx < len(self.url) and self.url[inx] != '/':
self.url = self.url[:inx] + '/' + self.url[inx:] self.url = self.url[:inx] + '/' + self.url[inx:]
# BUG?: adding upper() because rfc3987 lib
# rejects lower case %-encoding
# %2F is fine, but %2f -- standard supports either
matcher = rfc3987.match(self.url.upper(), 'IRI')
if not matcher:
raise Exception('Bad Request Url: ' + self.url)
# Match query regex # Match query regex
# ====================== # ======================
def _init_query(self, url): def _init_query(self, url):

View File

@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from handlers import StaticHandler from handlers import StaticHandler
@ -22,16 +23,22 @@ class RewriteHandler(WbUrlHandler):
#use_lxml_parser() #use_lxml_parser()
self.rewriter = LiveRewriter(defmod='mp_') self.rewriter = LiveRewriter(defmod='mp_')
head_insert = config.get('head_insert_html', view = config.get('head_insert_view')
'ui/head_insert.html') if not view:
head_insert = config.get('head_insert_html',
'ui/head_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
self.head_insert_view = view self.head_insert_view = view
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
view = config.get('frame_insert_view')
if not view:
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
self.frame_insert_view = view self.frame_insert_view = view
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -49,6 +56,11 @@ class RewriteHandler(WbUrlHandler):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest) head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
head_insert_func=head_insert_func, head_insert_func=head_insert_func,
env=wbrequest.env) env=wbrequest.env)

View File

@ -64,7 +64,6 @@ setup(
glob.glob('sample_archive/text_content/*')), glob.glob('sample_archive/text_content/*')),
], ],
install_requires=[ install_requires=[
'rfc3987',
'chardet', 'chardet',
'requests', 'requests',
'redis', 'redis',

View File

@ -8,7 +8,7 @@ class TestLiveRewriter:
self.testapp = webtest.TestApp(self.app) self.testapp = webtest.TestApp(self.app)
def test_live_rewrite_1(self): def test_live_rewrite_1(self):
headers = [('User-Agent', 'python')] headers = [('User-Agent', 'python'), ('Referer', 'http://localhost:80/rewrite/other.example.com')]
resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers) resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers)
assert resp.status_int == 200 assert resp.status_int == 200