1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

remove rfc3987 validation, since it was rejecting valid URLs

add extract_referrer_wburl_str() to extract the WbUrl string, if any,
from the referrer. Use it in live_rewrite_handler to override the
default referrer
This commit is contained in:
Ilya Kreymer 2014-04-15 16:38:53 -07:00
parent 611b9093bd
commit 85593696fa
8 changed files with 51 additions and 23 deletions

View File

@ -25,6 +25,16 @@
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# Referrer extraction
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
'http://blah.example.com/'
# incorrect referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str()
# no referer
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str()
# WbResponse Tests

View File

@ -105,6 +105,16 @@ class WbRequest(object):
def _parse_extra(self):
pass
def extract_referrer_wburl_str(self):
    """Return the part of the referrer that follows this request's prefix.

    The prefix is ``self.host_prefix + self.rel_prefix``. Returns ``None``
    when ``self.referrer`` is empty/unset or does not begin with that
    prefix; otherwise returns the remainder of the referrer, unmodified.
    """
    referrer = self.referrer
    if not referrer:
        return None

    # Build the prefix only once a referrer is known to exist.
    full_prefix = self.host_prefix + self.rel_prefix
    if referrer.startswith(full_prefix):
        return referrer[len(full_prefix):]

    return None
#=================================================================
class WbResponse(object):

View File

@ -17,7 +17,12 @@ from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
class LiveRewriter(object):
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')]
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'),
('HTTP_ACCEPT', 'Accept'),
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
('HTTP_REFERER', 'Referer'),
]
def __init__(self, defmod=''):
self.rewriter = RewriteContent(defmod=defmod)

View File

@ -60,13 +60,14 @@
# Error Urls
# ======================
>>> x = WbUrl('/#$%#/')
# no longer rejecting this here
#>>> x = WbUrl('/#$%#/')
#Traceback (most recent call last):
#Exception: Bad Request Url: http://#$%#/
>>> x = WbUrl('/http://example.com:abc/')
Traceback (most recent call last):
Exception: Bad Request Url: http://example.com:abc/
#>>> x = WbUrl('/http://example.com:abc/')
#Traceback (most recent call last):
#Exception: Bad Request Url: http://example.com:abc/
>>> x = WbUrl('')
Traceback (most recent call last):

View File

@ -39,7 +39,6 @@ wayback url format.
"""
import re
import rfc3987
#=================================================================
@ -104,14 +103,6 @@ class WbUrl(BaseWbUrl):
if inx < len(self.url) and self.url[inx] != '/':
self.url = self.url[:inx] + '/' + self.url[inx:]
# BUG?: adding upper() because rfc3987 lib
# rejects lower case %-encoding
# %2F is fine, but %2f -- standard supports either
matcher = rfc3987.match(self.url.upper(), 'IRI')
if not matcher:
raise Exception('Bad Request Url: ' + self.url)
# Match query regex
# ======================
def _init_query(self, url):

View File

@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from handlers import StaticHandler
@ -22,16 +23,22 @@ class RewriteHandler(WbUrlHandler):
#use_lxml_parser()
self.rewriter = LiveRewriter(defmod='mp_')
head_insert = config.get('head_insert_html',
'ui/head_insert.html')
view = config.get('head_insert_view')
if not view:
head_insert = config.get('head_insert_html',
'ui/head_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
self.head_insert_view = view
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
view = config.get('frame_insert_view')
if not view:
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
self.frame_insert_view = view
def __call__(self, wbrequest):
@ -49,6 +56,11 @@ class RewriteHandler(WbUrlHandler):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
head_insert_func=head_insert_func,
env=wbrequest.env)

View File

@ -64,7 +64,6 @@ setup(
glob.glob('sample_archive/text_content/*')),
],
install_requires=[
'rfc3987',
'chardet',
'requests',
'redis',

View File

@ -8,7 +8,7 @@ class TestLiveRewriter:
self.testapp = webtest.TestApp(self.app)
def test_live_rewrite_1(self):
headers = [('User-Agent', 'python')]
headers = [('User-Agent', 'python'), ('Referer', 'http://localhost:80/rewrite/other.example.com')]
resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers)
assert resp.status_int == 200