1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: add optional cookie_rewriter, created by urlrewriter and called from header_rewriter

cookie_rewriter works correctly with a concatenated set-cookie list, returns a list of rewritten 'set-cookie' headers
rewrite_live: add proxying of Host, Origin, additional headers
split header rewriter tests into test_header_rewriter, add test_cookie_rewriter
bump version to 0.4.0!
This commit is contained in:
Ilya Kreymer 2014-05-13 17:07:41 -07:00
parent 89da165467
commit 871cc26fa4
10 changed files with 190 additions and 59 deletions

View File

@ -1,4 +1,4 @@
PyWb 0.3.1
PyWb 0.4.0 Beta
=============
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop

View File

@ -0,0 +1,29 @@
import Cookie
#=================================================================
class WbUrlCookieRewriter(object):
    """Cookie rewriter for wburl-based requests.

    Removes the domain and rewrites the path, if any, to match the
    given WbUrl using the url rewriter. The ``expires`` attribute,
    if any, is removed as well.
    """

    def __init__(self, url_rewriter):
        """Store the url rewriter used to rewrite cookie paths.

        :param url_rewriter: object whose ``rewrite(path)`` method returns
            the rewritten form of a cookie path.
        """
        self.url_rewriter = url_rewriter

    def rewrite(self, cookie_str, header='Set-Cookie'):
        """Parse ``cookie_str`` and return its cookies in rewritten form.

        :param cookie_str: raw cookie header value; may contain several
            cookies.
        :param header: header name paired with every rewritten cookie.
        :return: list of ``(header, cookie_string)`` tuples, one per parsed
            cookie. The list is empty when ``cookie_str`` holds no cookies
            or cannot be parsed.
        """
        cookie = Cookie.SimpleCookie()
        try:
            cookie.load(cookie_str)
        except Cookie.CookieError:
            # A malformed cookie must not abort rewriting of the remaining
            # headers. Drop it rather than emit it with its original
            # domain/path left unrewritten.
            return []

        results = []

        # values() yields the same morsels as iteritems() did, and is
        # available on both Python 2 and Python 3.
        for morsel in cookie.values():
            if morsel.get('domain'):
                del morsel['domain']

            if morsel.get('path'):
                morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

            if morsel.get('expires'):
                del morsel['expires']

            results.append((header, morsel.OutputString()))

        return results

View File

@ -39,6 +39,8 @@ class HeaderRewriter:
PROXY_NO_REWRITE_HEADERS = ['content-length']
COOKIE_HEADERS = ['set-cookie', 'cookie']
def __init__(self, header_prefix='X-Archive-Orig-'):
self.header_prefix = header_prefix
@ -86,6 +88,8 @@ class HeaderRewriter:
new_headers = []
removed_header_dict = {}
cookie_rewriter = urlrewriter.get_cookie_rewriter()
for (name, value) in headers:
lowername = name.lower()
@ -109,6 +113,11 @@ class HeaderRewriter:
not content_rewritten):
new_headers.append((name, value))
elif (lowername in self.COOKIE_HEADERS and
cookie_rewriter):
cookie_list = cookie_rewriter.rewrite(value)
new_headers.extend(cookie_list)
else:
new_headers.append((self.header_prefix + name, value))

View File

@ -6,7 +6,9 @@ import requests
import datetime
import mimetypes
from pywb.utils.loaders import is_http
from urlparse import urlsplit
from pywb.utils.loaders import is_http, LimitReader
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
@ -23,6 +25,12 @@ class LiveRewriter(object):
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
('HTTP_RANGE', 'Range'),
('HTTP_CACHE_CONTROL', 'Cache-Control'),
('HTTP_X_REQUESTED_WITH', 'X-Requested-With'),
('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'),
('HTTP_COOKIE', 'Cookie'),
('CONTENT_TYPE', 'Content-Type'),
('CONTENT_LENGTH', 'Content-Length'),
('REL_REFERER', 'Referer'),
]
@ -67,10 +75,23 @@ class LiveRewriter(object):
method = env['REQUEST_METHOD'].upper()
input_ = env['wsgi.input']
host = env.get('HTTP_HOST')
origin = env.get('HTTP_ORIGIN')
if host or origin:
splits = urlsplit(url)
if host:
req_headers['Host'] = splits.netloc
if origin:
req_headers['Origin'] = (splits.scheme + '://' + splits.netloc)
req_headers.update(self.translate_headers(env))
if method in ('POST', 'PUT'):
data = input_
len_ = env.get('CONTENT_LENGTH')
if len_:
data = LimitReader(input_, int(len_))
else:
data = input_
response = requests.request(method=method,
url=url,

View File

@ -0,0 +1,25 @@
r"""
# No rewriting
>>> rewrite_cookie('a=b; c=d;')
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/;')
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')]
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/diff/path/;')
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')]
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT')
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
"""
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def rewrite_cookie(cookie_str):
    """Rewrite ``cookie_str`` with a WbUrlCookieRewriter built on the
    module-level ``urlrewriter`` and return the result.
    """
    cookie_rewriter = WbUrlCookieRewriter(urlrewriter)
    return cookie_rewriter.rewrite(cookie_str)

View File

@ -0,0 +1,80 @@
"""
#=================================================================
HTTP Headers Rewriting
#=================================================================
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'charset': 'utf-8',
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]),
'text_type': 'html'}
# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'charset': None,
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131010/http://example.com/other.html')]),
'text_type': None}
# cookie, host/origin rewriting
>>> _test_headers([('Connection', 'close'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com, abc=def; Path=somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')])
{'charset': None,
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
( 'Set-Cookie',
'abc=def; Path=/web/20131010/http://example.com/somefile.html'),
('X-Archive-Orig-Host', 'example.com'),
('X-Archive-Orig-Origin', 'https://example.com')]),
'text_type': None}
# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'content-encoding': 'gzip',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]),
'text_type': 'js'}
# Binary -- transfer-encoding removed
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
('Content-Encoding', 'gzip')]),
'text_type': None}
"""
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
headerrewriter = HeaderRewriter()
def _test_headers(headers, status='200 OK'):
    """Rewrite ``headers`` under ``status`` with the module-level
    ``headerrewriter`` and ``urlrewriter``, then pretty-print the
    attributes of the rewritten result.

    Returns whatever ``pprint.pprint`` returns.
    """
    status_headers = StatusAndHeaders(status, headers)
    result = headerrewriter.rewrite(status_headers, urlrewriter)
    return pprint.pprint(vars(result))
# Run this module's doctests when it is executed directly as a script.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -116,61 +116,13 @@ r"""
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
#=================================================================
HTTP Headers Rewriting
#=================================================================
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'charset': 'utf-8',
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]),
'text_type': 'html'}
# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'charset': None,
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131010/http://example.com/other.html')]),
'text_type': None}
# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'content-encoding': 'gzip',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]),
'text_type': 'js'}
# Binary
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]),
'text_type': None}
Removing Transfer-Encoding always, Was:
('Content-Encoding', 'gzip'),
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
@ -184,12 +136,6 @@ def _test_xml(string):
def _test_css(string):
    """Return ``string`` rewritten by a CSSRewriter built on the
    module-level ``urlrewriter``.
    """
    css_rewriter = CSSRewriter(urlrewriter)
    return css_rewriter.rewrite(string)
headerrewriter = HeaderRewriter()
def _test_headers(headers, status='200 OK'):
    """Rewrite ``headers`` under ``status`` with the module-level
    ``headerrewriter`` and ``urlrewriter``, then pretty-print the
    attributes of the rewritten result.

    Returns whatever ``pprint.pprint`` returns.
    """
    status_headers = StatusAndHeaders(status, headers)
    result = headerrewriter.rewrite(status_headers, urlrewriter)
    return pprint.pprint(vars(result))
if __name__ == "__main__":
import doctest

View File

@ -3,6 +3,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter
from pywb import get_test_dir
from io import BytesIO
# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break
@ -81,5 +83,17 @@ def test_example_domain_specific_3():
assert '/* Bootloader.configurePage' in buff
def test_post():
    """POST a small body to http://example.com/ through ``get_rewritten``
    and check that the response status code is 200.
    """
    # Bytes literal: io.BytesIO requires bytes on Python 3, and b'...' is
    # the same object type as a plain str literal on Python 2.
    buff = BytesIO(b'ABCDEF')

    env = {'REQUEST_METHOD': 'POST',
           'HTTP_ORIGIN': 'http://example.com',
           'HTTP_HOST': 'example.com',
           'wsgi.input': buff}

    status_headers, resp_buff = get_rewritten('http://example.com/', urlrewriter, env=env)
    assert status_headers.get_statuscode() == '200', status_headers
def get_rewritten(*args, **kwargs):
    """Forward all arguments to ``get_rewritten`` on a fresh LiveRewriter
    and return its result.
    """
    live_rewriter = LiveRewriter()
    return live_rewriter.get_rewritten(*args, **kwargs)

View File

@ -2,6 +2,7 @@ import copy
import urlparse
from wburl import WbUrl
from cookie_rewriter import WbUrlCookieRewriter
#=================================================================
@ -82,6 +83,9 @@ class UrlRewriter(object):
new_wburl.url = new_url
return UrlRewriter(new_wburl, self.prefix)
def get_cookie_rewriter(self):
    """Return a new WbUrlCookieRewriter that wraps this url rewriter."""
    cookie_rewriter = WbUrlCookieRewriter(self)
    return cookie_rewriter
def __repr__(self):
    """Return a ``UrlRewriter('<wburl>', '<prefix>')`` representation
    built from ``self.wburl`` and ``self.prefix``.
    """
    return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@ -114,3 +118,6 @@ class HttpsUrlRewriter(object):
def rebase_rewriter(self, new_url):
    """Return this same rewriter; ``new_url`` is not used."""
    return self
def get_cookie_rewriter(self):
    """Return None: this rewriter supplies no cookie rewriter."""
    return None

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.3.1',
version='0.4.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',