mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Remove rfc3987 validation, which was rejecting valid URLs.
Add extract_referrer_wburl_str() to extract the WbUrl string, if any, from the referrer. Use it in live_rewrite_handler to override the default referrer.
This commit is contained in:
parent
611b9093bd
commit
85593696fa
@ -25,6 +25,16 @@
|
|||||||
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||||
|
|
||||||
|
# Referrer extraction
|
||||||
|
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
|
||||||
|
'http://blah.example.com/'
|
||||||
|
|
||||||
|
# incorrect referer
|
||||||
|
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://other.example.com/web/2011/blah.example.com/'}).extract_referrer_wburl_str()
|
||||||
|
|
||||||
|
|
||||||
|
# no referer
|
||||||
|
>>> req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}).extract_referrer_wburl_str()
|
||||||
|
|
||||||
|
|
||||||
# WbResponse Tests
|
# WbResponse Tests
|
||||||
|
@ -105,6 +105,16 @@ class WbRequest(object):
|
|||||||
def _parse_extra(self):
|
def _parse_extra(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def extract_referrer_wburl_str(self):
|
||||||
|
if not self.referrer:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not self.referrer.startswith(self.host_prefix + self.rel_prefix):
|
||||||
|
return None
|
||||||
|
|
||||||
|
wburl_str = self.referrer[len(self.host_prefix + self.rel_prefix):]
|
||||||
|
return wburl_str
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class WbResponse(object):
|
class WbResponse(object):
|
||||||
|
@ -17,7 +17,12 @@ from pywb.rewrite.rewrite_content import RewriteContent
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class LiveRewriter(object):
|
class LiveRewriter(object):
|
||||||
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')]
|
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'),
|
||||||
|
('HTTP_ACCEPT', 'Accept'),
|
||||||
|
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
|
||||||
|
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
||||||
|
('HTTP_REFERER', 'Referer'),
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(self, defmod=''):
|
def __init__(self, defmod=''):
|
||||||
self.rewriter = RewriteContent(defmod=defmod)
|
self.rewriter = RewriteContent(defmod=defmod)
|
||||||
|
@ -60,13 +60,14 @@
|
|||||||
|
|
||||||
# Error Urls
|
# Error Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> x = WbUrl('/#$%#/')
|
# no longer rejecting this here
|
||||||
|
#>>> x = WbUrl('/#$%#/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
Exception: Bad Request Url: http://#$%#/
|
Exception: Bad Request Url: http://#$%#/
|
||||||
|
|
||||||
>>> x = WbUrl('/http://example.com:abc/')
|
#>>> x = WbUrl('/http://example.com:abc/')
|
||||||
Traceback (most recent call last):
|
#Traceback (most recent call last):
|
||||||
Exception: Bad Request Url: http://example.com:abc/
|
#Exception: Bad Request Url: http://example.com:abc/
|
||||||
|
|
||||||
>>> x = WbUrl('')
|
>>> x = WbUrl('')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
|
@ -39,7 +39,6 @@ wayback url format.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import rfc3987
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -104,14 +103,6 @@ class WbUrl(BaseWbUrl):
|
|||||||
if inx < len(self.url) and self.url[inx] != '/':
|
if inx < len(self.url) and self.url[inx] != '/':
|
||||||
self.url = self.url[:inx] + '/' + self.url[inx:]
|
self.url = self.url[:inx] + '/' + self.url[inx:]
|
||||||
|
|
||||||
# BUG?: adding upper() because rfc3987 lib
|
|
||||||
# rejects lower case %-encoding
|
|
||||||
# %2F is fine, but %2f -- standard supports either
|
|
||||||
matcher = rfc3987.match(self.url.upper(), 'IRI')
|
|
||||||
|
|
||||||
if not matcher:
|
|
||||||
raise Exception('Bad Request Url: ' + self.url)
|
|
||||||
|
|
||||||
# Match query regex
|
# Match query regex
|
||||||
# ======================
|
# ======================
|
||||||
def _init_query(self, url):
|
def _init_query(self, url):
|
||||||
|
@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse
|
|||||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||||
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
|
||||||
from handlers import StaticHandler
|
from handlers import StaticHandler
|
||||||
|
|
||||||
@ -22,16 +23,22 @@ class RewriteHandler(WbUrlHandler):
|
|||||||
#use_lxml_parser()
|
#use_lxml_parser()
|
||||||
self.rewriter = LiveRewriter(defmod='mp_')
|
self.rewriter = LiveRewriter(defmod='mp_')
|
||||||
|
|
||||||
head_insert = config.get('head_insert_html',
|
view = config.get('head_insert_view')
|
||||||
'ui/head_insert.html')
|
if not view:
|
||||||
|
head_insert = config.get('head_insert_html',
|
||||||
|
'ui/head_insert.html')
|
||||||
|
view = HeadInsertView.create_template(head_insert, 'Head Insert')
|
||||||
|
|
||||||
frame_insert = config.get('frame_insert_html',
|
|
||||||
'ui/frame_insert.html')
|
|
||||||
|
|
||||||
view = HeadInsertView.create_template(head_insert, 'Head Insert')
|
|
||||||
self.head_insert_view = view
|
self.head_insert_view = view
|
||||||
|
|
||||||
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
|
|
||||||
|
view = config.get('frame_insert_view')
|
||||||
|
if not view:
|
||||||
|
frame_insert = config.get('frame_insert_html',
|
||||||
|
'ui/frame_insert.html')
|
||||||
|
|
||||||
|
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
|
||||||
|
|
||||||
self.frame_insert_view = view
|
self.frame_insert_view = view
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
@ -49,6 +56,11 @@ class RewriteHandler(WbUrlHandler):
|
|||||||
|
|
||||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||||
|
|
||||||
|
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
|
||||||
|
if ref_wburl_str:
|
||||||
|
wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url
|
||||||
|
|
||||||
|
|
||||||
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
|
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
env=wbrequest.env)
|
env=wbrequest.env)
|
||||||
|
1
setup.py
1
setup.py
@ -64,7 +64,6 @@ setup(
|
|||||||
glob.glob('sample_archive/text_content/*')),
|
glob.glob('sample_archive/text_content/*')),
|
||||||
],
|
],
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'rfc3987',
|
|
||||||
'chardet',
|
'chardet',
|
||||||
'requests',
|
'requests',
|
||||||
'redis',
|
'redis',
|
||||||
|
@ -8,7 +8,7 @@ class TestLiveRewriter:
|
|||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
def test_live_rewrite_1(self):
|
def test_live_rewrite_1(self):
|
||||||
headers = [('User-Agent', 'python')]
|
headers = [('User-Agent', 'python'), ('Referer', 'http://localhost:80/rewrite/other.example.com')]
|
||||||
resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers)
|
resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers)
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user