mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: add optional cookie_rewriter, created by urlrewriter and called from header_rewriter
cookie_rewriter works correctly with a concatenated set-cookie list and returns a list of rewritten 'set-cookie' headers; rewrite_live: add proxying of Host, Origin and additional headers; split header rewriter tests into test_header_rewriter, add test_cookie_rewriter; bump version to 0.4.0!
This commit is contained in:
parent
89da165467
commit
871cc26fa4
@ -1,4 +1,4 @@
|
||||
PyWb 0.3.1
|
||||
PyWb 0.4.0 Beta
|
||||
=============
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
|
||||
|
29
pywb/rewrite/cookie_rewriter.py
Normal file
29
pywb/rewrite/cookie_rewriter.py
Normal file
@ -0,0 +1,29 @@
|
||||
import Cookie
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrlCookieRewriter(object):
    """ Cookie rewriter for wburl-based requests

    For every cookie found in a cookie header value: remove the domain
    and the expires attribute, and rewrite the path, if any, to match
    the given WbUrl using the url rewriter.
    """
    def __init__(self, url_rewriter):
        # url_rewriter must provide rewrite(path), used to rewrite cookie paths
        self.url_rewriter = url_rewriter

    def rewrite(self, cookie_str, header='Set-Cookie'):
        """ Parse cookie_str and return one rewritten header per cookie.

        :param cookie_str: raw cookie header value; may hold several cookies
        :param header: header name to pair with each rewritten cookie
        :return: list of (header, cookie_string) tuples; empty when
                 cookie_str holds no cookie or cannot be parsed
        """
        results = []

        cookie = Cookie.SimpleCookie()
        try:
            cookie.load(cookie_str)
        except Cookie.CookieError:
            # A malformed cookie must not abort rewriting of the
            # remaining response headers: emit no cookies for this header.
            return results

        for morsel in cookie.values():
            if morsel.get('domain'):
                del morsel['domain']

            if morsel.get('path'):
                morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

            if morsel.get('expires'):
                del morsel['expires']

            results.append((header, morsel.OutputString()))

        return results
|
@ -39,6 +39,8 @@ class HeaderRewriter:
|
||||
|
||||
PROXY_NO_REWRITE_HEADERS = ['content-length']
|
||||
|
||||
COOKIE_HEADERS = ['set-cookie', 'cookie']
|
||||
|
||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
||||
self.header_prefix = header_prefix
|
||||
|
||||
@ -86,6 +88,8 @@ class HeaderRewriter:
|
||||
new_headers = []
|
||||
removed_header_dict = {}
|
||||
|
||||
cookie_rewriter = urlrewriter.get_cookie_rewriter()
|
||||
|
||||
for (name, value) in headers:
|
||||
|
||||
lowername = name.lower()
|
||||
@ -109,6 +113,11 @@ class HeaderRewriter:
|
||||
not content_rewritten):
|
||||
new_headers.append((name, value))
|
||||
|
||||
elif (lowername in self.COOKIE_HEADERS and
|
||||
cookie_rewriter):
|
||||
cookie_list = cookie_rewriter.rewrite(value)
|
||||
new_headers.extend(cookie_list)
|
||||
|
||||
else:
|
||||
new_headers.append((self.header_prefix + name, value))
|
||||
|
||||
|
@ -6,7 +6,9 @@ import requests
|
||||
import datetime
|
||||
import mimetypes
|
||||
|
||||
from pywb.utils.loaders import is_http
|
||||
from urlparse import urlsplit
|
||||
|
||||
from pywb.utils.loaders import is_http, LimitReader
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
@ -23,6 +25,12 @@ class LiveRewriter(object):
|
||||
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
||||
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
|
||||
('HTTP_RANGE', 'Range'),
|
||||
('HTTP_CACHE_CONTROL', 'Cache-Control'),
|
||||
('HTTP_X_REQUESTED_WITH', 'X-Requested-With'),
|
||||
('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'),
|
||||
('HTTP_COOKIE', 'Cookie'),
|
||||
('CONTENT_TYPE', 'Content-Type'),
|
||||
('CONTENT_LENGTH', 'Content-Length'),
|
||||
('REL_REFERER', 'Referer'),
|
||||
]
|
||||
|
||||
@ -67,10 +75,23 @@ class LiveRewriter(object):
|
||||
method = env['REQUEST_METHOD'].upper()
|
||||
input_ = env['wsgi.input']
|
||||
|
||||
host = env.get('HTTP_HOST')
|
||||
origin = env.get('HTTP_ORIGIN')
|
||||
if host or origin:
|
||||
splits = urlsplit(url)
|
||||
if host:
|
||||
req_headers['Host'] = splits.netloc
|
||||
if origin:
|
||||
req_headers['Origin'] = (splits.scheme + '://' + splits.netloc)
|
||||
|
||||
req_headers.update(self.translate_headers(env))
|
||||
|
||||
if method in ('POST', 'PUT'):
|
||||
data = input_
|
||||
len_ = env.get('CONTENT_LENGTH')
|
||||
if len_:
|
||||
data = LimitReader(input_, int(len_))
|
||||
else:
|
||||
data = input_
|
||||
|
||||
response = requests.request(method=method,
|
||||
url=url,
|
||||
|
25
pywb/rewrite/test/test_cookie_rewriter.py
Normal file
25
pywb/rewrite/test/test_cookie_rewriter.py
Normal file
@ -0,0 +1,25 @@
|
||||
r"""
|
||||
# No rewriting
|
||||
>>> rewrite_cookie('a=b; c=d;')
|
||||
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
|
||||
|
||||
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/;')
|
||||
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')]
|
||||
|
||||
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/diff/path/;')
|
||||
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')]
|
||||
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT')
|
||||
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||
|
||||
def rewrite_cookie(cookie_str):
    """Rewrite cookie_str with a WbUrlCookieRewriter built on the module-level urlrewriter."""
    cookie_rewriter = WbUrlCookieRewriter(urlrewriter)
    return cookie_rewriter.rewrite(cookie_str)
|
||||
|
80
pywb/rewrite/test/test_header_rewriter.py
Normal file
80
pywb/rewrite/test/test_header_rewriter.py
Normal file
@ -0,0 +1,80 @@
|
||||
"""
|
||||
#=================================================================
|
||||
HTTP Headers Rewriting
|
||||
#=================================================================
|
||||
|
||||
# Text with charset
|
||||
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
||||
{'charset': 'utf-8',
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||
('X-Archive-Orig-Content-Length', '5'),
|
||||
('Content-Type', 'text/html;charset=UTF-8')]),
|
||||
'text_type': 'html'}
|
||||
|
||||
# Redirect
|
||||
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Location', '/web/20131010/http://example.com/other.html')]),
|
||||
'text_type': None}
|
||||
|
||||
# cookie, host/origin rewriting
|
||||
>>> _test_headers([('Connection', 'close'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com, abc=def; Path=somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||
( 'Set-Cookie',
|
||||
'abc=def; Path=/web/20131010/http://example.com/somefile.html'),
|
||||
('X-Archive-Orig-Host', 'example.com'),
|
||||
('X-Archive-Orig-Origin', 'https://example.com')]),
|
||||
'text_type': None}
|
||||
|
||||
|
||||
|
||||
# gzip
|
||||
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'content-encoding': 'gzip',
|
||||
'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
||||
('Content-Type', 'text/javascript')]),
|
||||
'text_type': 'js'}
|
||||
|
||||
# Binary -- transfer-encoding removed
|
||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||
('Content-Type', 'image/png'),
|
||||
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||
('Content-Encoding', 'gzip')]),
|
||||
'text_type': None}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
|
||||
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||
|
||||
|
||||
headerrewriter = HeaderRewriter()
|
||||
|
||||
def _test_headers(headers, status = '200 OK'):
    """Rewrite the given headers and pretty-print the attributes of the result.

    The headers are wrapped in a StatusAndHeaders with the given status and
    passed through the module-level headerrewriter and urlrewriter.
    """
    status_headers = StatusAndHeaders(status, headers)
    rewritten = headerrewriter.rewrite(status_headers, urlrewriter)
    # pprint.pprint() only prints and returns None, so this function
    # returns None as well; the doctests compare the printed output.
    pprint.pprint(vars(rewritten))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
@ -116,61 +116,13 @@ r"""
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
|
||||
|
||||
#=================================================================
|
||||
HTTP Headers Rewriting
|
||||
#=================================================================
|
||||
|
||||
# Text with charset
|
||||
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
||||
{'charset': 'utf-8',
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||
('X-Archive-Orig-Content-Length', '5'),
|
||||
('Content-Type', 'text/html;charset=UTF-8')]),
|
||||
'text_type': 'html'}
|
||||
|
||||
# Redirect
|
||||
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Location', '/web/20131010/http://example.com/other.html')]),
|
||||
'text_type': None}
|
||||
|
||||
# gzip
|
||||
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'content-encoding': 'gzip',
|
||||
'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
||||
('Content-Type', 'text/javascript')]),
|
||||
'text_type': 'js'}
|
||||
|
||||
# Binary
|
||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||
('Content-Type', 'image/png'),
|
||||
('X-Archive-Orig-Cookie', 'blah'),
|
||||
('Content-Encoding', 'gzip')]),
|
||||
'text_type': None}
|
||||
|
||||
Removing Transfer-Encoding always, Was:
|
||||
('Content-Encoding', 'gzip'),
|
||||
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
|
||||
#=================================================================
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
|
||||
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||
|
||||
@ -184,12 +136,6 @@ def _test_xml(string):
|
||||
def _test_css(string):
|
||||
return CSSRewriter(urlrewriter).rewrite(string)
|
||||
|
||||
headerrewriter = HeaderRewriter()
|
||||
|
||||
def _test_headers(headers, status = '200 OK'):
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
|
||||
return pprint.pprint(vars(rewritten))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
@ -3,6 +3,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
# This module has some rewriting tests against the 'live web'
|
||||
# As such, the content may change and the test may break
|
||||
|
||||
@ -81,5 +83,17 @@ def test_example_domain_specific_3():
|
||||
assert '/* Bootloader.configurePage' in buff
|
||||
|
||||
|
||||
def test_post():
    """POST a small body to http://example.com/ through the live rewriter and expect status 200."""
    body = BytesIO('ABCDEF')

    env = dict([
        ('REQUEST_METHOD', 'POST'),
        ('HTTP_ORIGIN', 'http://example.com'),
        ('HTTP_HOST', 'example.com'),
        ('wsgi.input', body),
    ])

    result = get_rewritten('http://example.com/', urlrewriter, env=env)
    status_headers, resp_buff = result
    assert status_headers.get_statuscode() == '200', status_headers
|
||||
|
||||
|
||||
def get_rewritten(*args, **kwargs):
    """Forward all arguments to get_rewritten() of a newly created LiveRewriter."""
    live_rewriter = LiveRewriter()
    return live_rewriter.get_rewritten(*args, **kwargs)
|
||||
|
@ -2,6 +2,7 @@ import copy
|
||||
import urlparse
|
||||
|
||||
from wburl import WbUrl
|
||||
from cookie_rewriter import WbUrlCookieRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -82,6 +83,9 @@ class UrlRewriter(object):
|
||||
new_wburl.url = new_url
|
||||
return UrlRewriter(new_wburl, self.prefix)
|
||||
|
||||
def get_cookie_rewriter(self):
    """Return a new WbUrlCookieRewriter that uses this url rewriter."""
    cookie_rewriter = WbUrlCookieRewriter(self)
    return cookie_rewriter
|
||||
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
|
||||
@ -114,3 +118,6 @@ class HttpsUrlRewriter(object):
|
||||
|
||||
def rebase_rewriter(self, new_url):
|
||||
return self
|
||||
|
||||
def get_cookie_rewriter(self):
    """Return None: this rewriter provides no cookie rewriter."""
    no_cookie_rewriter = None
    return no_cookie_rewriter
|
||||
|
Loading…
x
Reference in New Issue
Block a user