mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: add optional cookie_rewriter, created by urlrewriter and called from header_rewriter
cookie_rewriter works correctly with a concatenated set-cookie list and returns a list of rewritten 'set-cookie' headers.
rewrite_live: add proxying of Host, Origin, and additional headers.
Split header rewriter tests into test_header_rewriter; add test_cookie_rewriter.
Bump version to 0.4.0!
This commit is contained in:
parent
89da165467
commit
871cc26fa4
@ -1,4 +1,4 @@
|
|||||||
PyWb 0.3.1
|
PyWb 0.4.0 Beta
|
||||||
=============
|
=============
|
||||||
|
|
||||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
|
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
|
||||||
|
29
pywb/rewrite/cookie_rewriter.py
Normal file
29
pywb/rewrite/cookie_rewriter.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import Cookie
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class WbUrlCookieRewriter(object):
|
||||||
|
""" Cookie rewriter for wburl-based requests
|
||||||
|
Remove the domain and rewrite path, if any, to match
|
||||||
|
given WbUrl using the url rewriter.
|
||||||
|
"""
|
||||||
|
def __init__(self, url_rewriter):
|
||||||
|
self.url_rewriter = url_rewriter
|
||||||
|
|
||||||
|
def rewrite(self, cookie_str, header='Set-Cookie'):
|
||||||
|
cookie = Cookie.SimpleCookie()
|
||||||
|
cookie.load(cookie_str)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for name, morsel in cookie.iteritems():
|
||||||
|
if morsel.get('domain'):
|
||||||
|
del morsel['domain']
|
||||||
|
if morsel.get('path'):
|
||||||
|
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||||
|
if morsel.get('expires'):
|
||||||
|
del morsel['expires']
|
||||||
|
|
||||||
|
results.append((header, morsel.OutputString()))
|
||||||
|
|
||||||
|
return results
|
@ -39,6 +39,8 @@ class HeaderRewriter:
|
|||||||
|
|
||||||
PROXY_NO_REWRITE_HEADERS = ['content-length']
|
PROXY_NO_REWRITE_HEADERS = ['content-length']
|
||||||
|
|
||||||
|
COOKIE_HEADERS = ['set-cookie', 'cookie']
|
||||||
|
|
||||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
def __init__(self, header_prefix='X-Archive-Orig-'):
|
||||||
self.header_prefix = header_prefix
|
self.header_prefix = header_prefix
|
||||||
|
|
||||||
@ -86,6 +88,8 @@ class HeaderRewriter:
|
|||||||
new_headers = []
|
new_headers = []
|
||||||
removed_header_dict = {}
|
removed_header_dict = {}
|
||||||
|
|
||||||
|
cookie_rewriter = urlrewriter.get_cookie_rewriter()
|
||||||
|
|
||||||
for (name, value) in headers:
|
for (name, value) in headers:
|
||||||
|
|
||||||
lowername = name.lower()
|
lowername = name.lower()
|
||||||
@ -109,6 +113,11 @@ class HeaderRewriter:
|
|||||||
not content_rewritten):
|
not content_rewritten):
|
||||||
new_headers.append((name, value))
|
new_headers.append((name, value))
|
||||||
|
|
||||||
|
elif (lowername in self.COOKIE_HEADERS and
|
||||||
|
cookie_rewriter):
|
||||||
|
cookie_list = cookie_rewriter.rewrite(value)
|
||||||
|
new_headers.extend(cookie_list)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
new_headers.append((self.header_prefix + name, value))
|
new_headers.append((self.header_prefix + name, value))
|
||||||
|
|
||||||
|
@ -6,7 +6,9 @@ import requests
|
|||||||
import datetime
|
import datetime
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from pywb.utils.loaders import is_http
|
from urlparse import urlsplit
|
||||||
|
|
||||||
|
from pywb.utils.loaders import is_http, LimitReader
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
@ -23,6 +25,12 @@ class LiveRewriter(object):
|
|||||||
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
||||||
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
|
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
|
||||||
('HTTP_RANGE', 'Range'),
|
('HTTP_RANGE', 'Range'),
|
||||||
|
('HTTP_CACHE_CONTROL', 'Cache-Control'),
|
||||||
|
('HTTP_X_REQUESTED_WITH', 'X-Requested-With'),
|
||||||
|
('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'),
|
||||||
|
('HTTP_COOKIE', 'Cookie'),
|
||||||
|
('CONTENT_TYPE', 'Content-Type'),
|
||||||
|
('CONTENT_LENGTH', 'Content-Length'),
|
||||||
('REL_REFERER', 'Referer'),
|
('REL_REFERER', 'Referer'),
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -67,10 +75,23 @@ class LiveRewriter(object):
|
|||||||
method = env['REQUEST_METHOD'].upper()
|
method = env['REQUEST_METHOD'].upper()
|
||||||
input_ = env['wsgi.input']
|
input_ = env['wsgi.input']
|
||||||
|
|
||||||
|
host = env.get('HTTP_HOST')
|
||||||
|
origin = env.get('HTTP_ORIGIN')
|
||||||
|
if host or origin:
|
||||||
|
splits = urlsplit(url)
|
||||||
|
if host:
|
||||||
|
req_headers['Host'] = splits.netloc
|
||||||
|
if origin:
|
||||||
|
req_headers['Origin'] = (splits.scheme + '://' + splits.netloc)
|
||||||
|
|
||||||
req_headers.update(self.translate_headers(env))
|
req_headers.update(self.translate_headers(env))
|
||||||
|
|
||||||
if method in ('POST', 'PUT'):
|
if method in ('POST', 'PUT'):
|
||||||
data = input_
|
len_ = env.get('CONTENT_LENGTH')
|
||||||
|
if len_:
|
||||||
|
data = LimitReader(input_, int(len_))
|
||||||
|
else:
|
||||||
|
data = input_
|
||||||
|
|
||||||
response = requests.request(method=method,
|
response = requests.request(method=method,
|
||||||
url=url,
|
url=url,
|
||||||
|
25
pywb/rewrite/test/test_cookie_rewriter.py
Normal file
25
pywb/rewrite/test/test_cookie_rewriter.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
r"""
|
||||||
|
# No rewriting
|
||||||
|
>>> rewrite_cookie('a=b; c=d;')
|
||||||
|
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
|
||||||
|
|
||||||
|
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/;')
|
||||||
|
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')]
|
||||||
|
|
||||||
|
>>> rewrite_cookie('some=value; Domain=foo.com; Path=/diff/path/;')
|
||||||
|
[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')]
|
||||||
|
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT')
|
||||||
|
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
|
||||||
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
|
|
||||||
|
def rewrite_cookie(cookie_str):
|
||||||
|
return WbUrlCookieRewriter(urlrewriter).rewrite(cookie_str)
|
||||||
|
|
80
pywb/rewrite/test/test_header_rewriter.py
Normal file
80
pywb/rewrite/test/test_header_rewriter.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
"""
|
||||||
|
#=================================================================
|
||||||
|
HTTP Headers Rewriting
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
# Text with charset
|
||||||
|
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
||||||
|
{'charset': 'utf-8',
|
||||||
|
'removed_header_dict': {},
|
||||||
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||||
|
('X-Archive-Orig-Content-Length', '5'),
|
||||||
|
('Content-Type', 'text/html;charset=UTF-8')]),
|
||||||
|
'text_type': 'html'}
|
||||||
|
|
||||||
|
# Redirect
|
||||||
|
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
|
||||||
|
{'charset': None,
|
||||||
|
'removed_header_dict': {},
|
||||||
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||||
|
('Location', '/web/20131010/http://example.com/other.html')]),
|
||||||
|
'text_type': None}
|
||||||
|
|
||||||
|
# cookie, host/origin rewriting
|
||||||
|
>>> _test_headers([('Connection', 'close'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com, abc=def; Path=somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')])
|
||||||
|
{'charset': None,
|
||||||
|
'removed_header_dict': {},
|
||||||
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||||
|
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||||
|
( 'Set-Cookie',
|
||||||
|
'abc=def; Path=/web/20131010/http://example.com/somefile.html'),
|
||||||
|
('X-Archive-Orig-Host', 'example.com'),
|
||||||
|
('X-Archive-Orig-Origin', 'https://example.com')]),
|
||||||
|
'text_type': None}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# gzip
|
||||||
|
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||||
|
{'charset': None,
|
||||||
|
'removed_header_dict': {'content-encoding': 'gzip',
|
||||||
|
'transfer-encoding': 'chunked'},
|
||||||
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
||||||
|
('Content-Type', 'text/javascript')]),
|
||||||
|
'text_type': 'js'}
|
||||||
|
|
||||||
|
# Binary -- transfer-encoding removed
|
||||||
|
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||||
|
{'charset': None,
|
||||||
|
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||||
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||||
|
('Content-Type', 'image/png'),
|
||||||
|
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||||
|
('Content-Encoding', 'gzip')]),
|
||||||
|
'text_type': None}
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||||
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
|
import pprint
|
||||||
|
|
||||||
|
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||||
|
|
||||||
|
|
||||||
|
headerrewriter = HeaderRewriter()
|
||||||
|
|
||||||
|
def _test_headers(headers, status = '200 OK'):
|
||||||
|
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
|
||||||
|
return pprint.pprint(vars(rewritten))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
|
||||||
|
|
@ -116,61 +116,13 @@ r"""
|
|||||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||||
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
|
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
HTTP Headers Rewriting
|
|
||||||
#=================================================================
|
|
||||||
|
|
||||||
# Text with charset
|
|
||||||
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
|
||||||
{'charset': 'utf-8',
|
|
||||||
'removed_header_dict': {},
|
|
||||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
|
||||||
('X-Archive-Orig-Content-Length', '5'),
|
|
||||||
('Content-Type', 'text/html;charset=UTF-8')]),
|
|
||||||
'text_type': 'html'}
|
|
||||||
|
|
||||||
# Redirect
|
|
||||||
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
|
|
||||||
{'charset': None,
|
|
||||||
'removed_header_dict': {},
|
|
||||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
|
||||||
('Location', '/web/20131010/http://example.com/other.html')]),
|
|
||||||
'text_type': None}
|
|
||||||
|
|
||||||
# gzip
|
|
||||||
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
|
||||||
{'charset': None,
|
|
||||||
'removed_header_dict': {'content-encoding': 'gzip',
|
|
||||||
'transfer-encoding': 'chunked'},
|
|
||||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
|
||||||
('Content-Type', 'text/javascript')]),
|
|
||||||
'text_type': 'js'}
|
|
||||||
|
|
||||||
# Binary
|
|
||||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
|
||||||
{'charset': None,
|
|
||||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
|
||||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
|
||||||
('Content-Type', 'image/png'),
|
|
||||||
('X-Archive-Orig-Cookie', 'blah'),
|
|
||||||
('Content-Encoding', 'gzip')]),
|
|
||||||
'text_type': None}
|
|
||||||
|
|
||||||
Removing Transfer-Encoding always, Was:
|
|
||||||
('Content-Encoding', 'gzip'),
|
|
||||||
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
||||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
|
||||||
|
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
|
|
||||||
import pprint
|
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||||
|
|
||||||
@ -184,12 +136,6 @@ def _test_xml(string):
|
|||||||
def _test_css(string):
|
def _test_css(string):
|
||||||
return CSSRewriter(urlrewriter).rewrite(string)
|
return CSSRewriter(urlrewriter).rewrite(string)
|
||||||
|
|
||||||
headerrewriter = HeaderRewriter()
|
|
||||||
|
|
||||||
def _test_headers(headers, status = '200 OK'):
|
|
||||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
|
|
||||||
return pprint.pprint(vars(rewritten))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
@ -3,6 +3,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter
|
|||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
# This module has some rewriting tests against the 'live web'
|
# This module has some rewriting tests against the 'live web'
|
||||||
# As such, the content may change and the test may break
|
# As such, the content may change and the test may break
|
||||||
|
|
||||||
@ -81,5 +83,17 @@ def test_example_domain_specific_3():
|
|||||||
assert '/* Bootloader.configurePage' in buff
|
assert '/* Bootloader.configurePage' in buff
|
||||||
|
|
||||||
|
|
||||||
|
def test_post():
|
||||||
|
buff = BytesIO('ABCDEF')
|
||||||
|
|
||||||
|
env = {'REQUEST_METHOD': 'POST',
|
||||||
|
'HTTP_ORIGIN': 'http://example.com',
|
||||||
|
'HTTP_HOST': 'example.com',
|
||||||
|
'wsgi.input': buff}
|
||||||
|
|
||||||
|
status_headers, resp_buff = get_rewritten('http://example.com/', urlrewriter, env=env)
|
||||||
|
assert status_headers.get_statuscode() == '200', status_headers
|
||||||
|
|
||||||
|
|
||||||
def get_rewritten(*args, **kwargs):
|
def get_rewritten(*args, **kwargs):
|
||||||
return LiveRewriter().get_rewritten(*args, **kwargs)
|
return LiveRewriter().get_rewritten(*args, **kwargs)
|
||||||
|
@ -2,6 +2,7 @@ import copy
|
|||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from wburl import WbUrl
|
from wburl import WbUrl
|
||||||
|
from cookie_rewriter import WbUrlCookieRewriter
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -82,6 +83,9 @@ class UrlRewriter(object):
|
|||||||
new_wburl.url = new_url
|
new_wburl.url = new_url
|
||||||
return UrlRewriter(new_wburl, self.prefix)
|
return UrlRewriter(new_wburl, self.prefix)
|
||||||
|
|
||||||
|
def get_cookie_rewriter(self):
|
||||||
|
return WbUrlCookieRewriter(self)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||||
|
|
||||||
@ -114,3 +118,6 @@ class HttpsUrlRewriter(object):
|
|||||||
|
|
||||||
def rebase_rewriter(self, new_url):
|
def rebase_rewriter(self, new_url):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def get_cookie_rewriter(self):
|
||||||
|
return None
|
||||||
|
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ class PyTest(TestCommand):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='pywb',
|
name='pywb',
|
||||||
version='0.3.1',
|
version='0.4.0',
|
||||||
url='https://github.com/ikreymer/pywb',
|
url='https://github.com/ikreymer/pywb',
|
||||||
author='Ilya Kreymer',
|
author='Ilya Kreymer',
|
||||||
author_email='ikreymer@gmail.com',
|
author_email='ikreymer@gmail.com',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user