1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

tests: add new tests for header_rewriter

default rewriter: use HostScopeCookieRewriter as the default cookie rewriter; add 'cookie' entry to all_rewriters
This commit is contained in:
Ilya Kreymer 2017-05-23 23:56:44 -07:00
parent 97182b71b7
commit 37dc4693c0
4 changed files with 231 additions and 174 deletions

View File

@ -10,7 +10,6 @@ import webencodings
import tempfile
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
from pywb.utils.loaders import load_yaml_config
@ -29,9 +28,6 @@ class BaseContentRewriter(object):
def add_rewriter(self, rw):
    """Register ``rw`` in ``self.all_rewriters`` under its ``name`` attribute.

    An existing entry with the same name is replaced.
    """
    registry = self.all_rewriters
    registry[rw.name] = rw
def get_rewriter(self, url, text_type):
    """Return the entry of ``self.all_rewriters`` stored under ``text_type``.

    ``url`` is accepted but not consulted here. The lookup goes through
    ``.get``, so a missing ``text_type`` yields that method's default
    rather than raising.
    """
    registry = self.all_rewriters
    return registry.get(text_type)
def load_rules(self, filename):
config = load_yaml_config(filename)
for rule in config.get('rules'):
@ -157,7 +153,7 @@ class BaseContentRewriter(object):
head_insert_func=None,
cdx=None):
rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
content_rewriter = None
if rwinfo.should_rw_content():
@ -259,13 +255,13 @@ class StreamingRewriter(object):
class RewriteInfo(object):
TAG_REGEX = re.compile(b'^\s*\<')
def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
def __init__(self, record, content_rewriter, url_rewriter, cookie_rewriter=None):
self.record = record
self._content_stream = None
self.is_content_rw = False
self.rewrite_types = rewrite_types
self.rewrite_types = content_rewriter.get_rewrite_types()
self.text_type = None
self.charset = None
@ -273,7 +269,9 @@ class RewriteInfo(object):
self.url_rewriter = url_rewriter
if not cookie_rewriter:
cookie_rewriter = ExactPathCookieRewriter(url_rewriter)
cookie_rw_class = content_rewriter.all_rewriters.get('cookie')
if cookie_rw_class:
cookie_rewriter = cookie_rw_class(url_rewriter)
self.cookie_rewriter = cookie_rewriter

View File

@ -8,6 +8,7 @@ from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRe
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
from pywb.rewrite.cookie_rewriter import HostScopeCookieRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
@ -20,6 +21,7 @@ from pywb.rewrite.rewrite_amf import RewriteAMF
class DefaultRewriter(BaseContentRewriter):
all_rewriters = {
'header': PrefixHeaderRewriter,
'cookie': HostScopeCookieRewriter,
'html': HTMLRewriter,
'html-banner-only': HTMLInsertOnlyRewriter,

View File

@ -0,0 +1,223 @@
from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import BufferWARCWriter
from warcio.timeutils import datetime_to_http_date
from pywb.rewrite.content_rewriter import RewriteInfo
from pywb.rewrite.default_rewriter import DefaultRewriter
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from datetime import datetime
from io import BytesIO
class TestHeaderRewriter(object):
    """Exercise ``PrefixHeaderRewriter`` on records wrapped in ``RewriteInfo``.

    Every test builds a WARC response record for ``http://example.com/``,
    runs the header rewriter over it and compares the serialized headers
    with the expected header block.
    """

    @classmethod
    def setup_class(cls):
        """Create the URL rewriter and content rewriter shared by all tests."""
        cls.urlrewriter = UrlRewriter('20171226/http://example.com/', '/warc/')
        cls.default_rewriter = DefaultRewriter()

    @classmethod
    def get_rwinfo(cls, record):
        """Wrap ``record`` in a ``RewriteInfo`` built from the shared rewriters.

        ``cookie_rewriter`` is passed as ``None``, leaving the choice of
        cookie rewriter to ``RewriteInfo`` itself.
        """
        return RewriteInfo(record=record,
                           content_rewriter=cls.default_rewriter,
                           url_rewriter=cls.urlrewriter, cookie_rewriter=None)

    @classmethod
    def do_rewrite(cls, statusline, headers):
        """Build a response record and return the ``RewriteInfo`` wrapping it.

        :param statusline: HTTP status line without protocol, e.g. ``'200 OK'``
        :param headers: list of ``(name, value)`` header tuples
        """
        writer = BufferWARCWriter()

        http_headers = StatusAndHeaders(statusline, headers, protocol='HTTP/1.0')

        record = writer.create_warc_record('http://example.com/', 'response',
                                           http_headers=http_headers)

        return cls.get_rwinfo(record)

    def test_header_rewrite_200_response(self):
        """HTML response with a charset: ``Date`` is prefixed, the rest kept."""
        headers = [('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
                   ('Content-Length', '5'),
                   ('Content-Type', 'text/html;charset=UTF-8')]

        res = """\
HTTP/1.0 200 OK\r\n\
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
Content-Length: 5\r\n\
Content-Type: text/html;charset=UTF-8\r\n\
"""
        rwinfo = self.do_rewrite('200 OK', headers)
        http_headers = PrefixHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

        assert rwinfo.text_type == 'html'
        assert rwinfo.charset == 'utf-8'

    def test_header_rewrite_redirect(self):
        """Redirect: ``Location`` is rewritten, ``Connection`` is prefixed."""
        headers = [('Connection', 'close'),
                   ('Location', 'http://example.com/other.html')]

        res = """\
HTTP/1.0 302 Redirect\r\n\
X-Archive-Orig-Connection: close\r\n\
Location: /warc/20171226/http://example.com/other.html\r\n\
"""
        rwinfo = self.do_rewrite('302 Redirect', headers)
        http_headers = PrefixHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

        # No Content-Type header was supplied, so neither value is set.
        assert rwinfo.text_type is None
        assert rwinfo.charset is None

    def test_header_rewrite_gzipped(self):
        """Gzipped JS: length/encoding headers change once the stream is read."""
        headers = [('Content-Length', '199999'),
                   ('Content-Type', 'text/javascript'),
                   ('Content-Encoding', 'gzip'),
                   ('Transfer-Encoding', 'chunked')]

        rwinfo = self.do_rewrite('200 OK', headers)

        # Content-Encoding, Content-Length not yet rewritten
        res = """\
HTTP/1.0 200 OK\r\n\
Content-Length: 199999\r\n\
Content-Type: text/javascript\r\n\
Content-Encoding: gzip\r\n\
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
"""
        http_headers = PrefixHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

        assert rwinfo.text_type == 'js'
        assert rwinfo.charset is None

        # access stream -- only the attribute access matters here, the
        # returned stream itself is not used by the test
        _ = rwinfo.content_stream

        # Content-Encoding, Content-Length rewritten now
        res = """\
HTTP/1.0 200 OK\r\n\
X-Archive-Orig-Content-Length: 199999\r\n\
Content-Type: text/javascript\r\n\
X-Archive-Orig-Content-Encoding: gzip\r\n\
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
"""
        http_headers = PrefixHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

    def test_header_rewrite_binary(self):
        """Binary (PNG) response: cookies are split and their paths rewritten."""
        headers = [('Content-Length', '200000'),
                   ('Content-Type', 'image/png'),
                   ('Set-Cookie', 'foo=bar; Path=/; abc=123; Path=/path.html'),
                   ('Content-Encoding', 'gzip'),
                   ('Transfer-Encoding', 'chunked'),
                   ('X-Custom', 'test')]

        res = """\
HTTP/1.0 200 OK\r\n\
Content-Length: 200000\r\n\
Content-Type: image/png\r\n\
Set-Cookie: foo=bar; Path=/warc/20171226/http://example.com/\r\n\
Set-Cookie: abc=123; Path=/warc/20171226/http://example.com/path.html\r\n\
Content-Encoding: gzip\r\n\
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
X-Archive-Orig-X-Custom: test\r\n\
"""
        rwinfo = self.do_rewrite('200 OK', headers)
        http_headers = PrefixHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

        assert rwinfo.text_type is None
        assert rwinfo.charset is None
def _test_head_data(headers, status='200 OK', rewriter=None):
    """Rewrite ``headers`` via ``headerrewriter`` and return the status headers.

    NOTE(review): no ``headerrewriter`` is defined by the imports visible at
    the top of this test module, and the ``rewriter=None`` default has no
    ``get_cookie_rewriter`` -- this looks like a legacy helper kept in a
    disabled state; confirm before relying on it.
    """
    rewrite = headerrewriter.rewrite
    original = StatusAndHeaders(status, headers)
    cookie_rewriter = rewriter.get_cookie_rewriter()
    result = rewrite(original, rewriter, cookie_rewriter)
    return result.status_headers
def _test_cookie_headers():
    """Check cookie path rewriting and prefixing of Connection/Host/Origin."""
    # cookie, host/origin rewriting
    original = [('Connection', 'close'),
                ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
                ('Host', 'example.com'),
                ('Origin', 'https://example.com')]
    res = _test_head_data(original)

    expected = [
        ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
        ('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html'),
        ('X-Archive-Orig-Connection', 'close'),
        ('X-Archive-Orig-Host', 'example.com'),
        ('X-Archive-Orig-Origin', 'https://example.com'),
    ]
    for header in expected:
        assert header in res.headers
def _make_cache_headers():
    """Return a fresh list of cache-related response headers.

    ``Expires`` is set to the current time formatted as an HTTP date.
    """
    return [('Content-Length', '123'),
            ('Cache-Control', 'max-age=10'),
            ('Expires', datetime_to_http_date(datetime.now())),
            ('ETag', '123456')]
def _test_proxy_headers(http_cache=None):
    """Rewrite the cache headers with the given ``http_cache`` rewrite option.

    :param http_cache: value stored under ``'http_cache'`` in ``rewrite_opts``
    :return: the ``status_headers`` attribute of the rewrite result
    """
    headers = _make_cache_headers()
    status = '200 OK'

    opts = {'http_cache': http_cache}
    rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                           rewrite_opts=opts)

    rewrite = headerrewriter.rewrite
    original = StatusAndHeaders(status, headers)
    cookie_rewriter = rewriter.get_cookie_rewriter()
    result = rewrite(original, rewriter, cookie_rewriter)
    return result.status_headers
def _test_proxy_default():
    """No ``http_cache`` option: cache headers appear under ``X-Archive-Orig-``."""
    res = _test_proxy_headers()

    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('X-Archive-Orig-Cache-Control') is not None
    assert res.get_header('X-Archive-Orig-Expires') is not None
    assert res.get_header('X-Archive-Orig-ETag') is not None
def _test_proxy_pass():
    """``http_cache='pass'``: the original cache headers are kept unchanged."""
    res = _test_proxy_headers('pass')

    assert res.get_header('Cache-Control') == 'max-age=10'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is not None
    assert res.get_header('ETag') is not None
def _test_proxy_set_age():
    """``http_cache='600'``: max-age becomes 600, Expires is set, ETag dropped."""
    res = _test_proxy_headers('600')

    assert res.get_header('Cache-Control') == 'max-age=600'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is not None
    assert res.get_header('ETag') is None
def _test_proxy_zero():
    """``http_cache='0'``: caching is disabled and Expires/ETag are absent."""
    res = _test_proxy_headers('0')

    assert res.get_header('Cache-Control') == 'no-cache; no-store'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is None
    assert res.get_header('ETag') is None
def _test_proxy_not_num():
    """Non-numeric ``http_cache`` ('blah') is expected to behave like '0'."""
    res = _test_proxy_headers('blah')

    assert res.get_header('Cache-Control') == 'no-cache; no-store'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is None
    assert res.get_header('ETag') is None
# When executed as a script, run doctest over this module.
# NOTE(review): no doctests are visible in this module's code shown here --
# this guard may be a leftover; confirm.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,166 +0,0 @@
"""
#=================================================================
HTTP Headers Rewriting
#=================================================================
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'charset': 'utf-8',
'removed_header_dict': {'content-length': '5'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]),
'text_type': 'html'}
# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'charset': None,
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131010/http://example.com/other.html')]),
'text_type': None}
# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'content-encoding': 'gzip',
'content-length': '199999',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript'),
('X-Archive-Orig-Content-Encoding', 'gzip'),
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': 'js'}
# Binary -- transfer-encoding rewritten
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
('Content-Encoding', 'gzip'),
('X-Archive-Orig-Transfer-Encoding', 'chunked'),
('X-Archive-Orig-X-Proxy', 'test')]),
'text_type': None}
"""
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_http_date
from datetime import datetime
import pprint
from mock import patch
# Module-level fixtures shared by the doctests and test helpers below:
# `urlrewriter` is the default `rewriter` argument of the helpers, and
# `headerrewriter` is the object whose `rewrite()` method they call.
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
headerrewriter = HeaderRewriter()
def _repr_format(sh):
headers_str = pprint.pformat(sh.headers, indent=2, width=80)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(sh.protocol, sh.statusline, headers_str)
@patch('warcio.statusandheaders.StatusAndHeaders.__repr__', _repr_format)
def _test_headers(headers, status='200 OK', rewriter=urlrewriter):
    """Doctest helper: rewrite ``headers`` and pretty-print the result's attributes.

    ``StatusAndHeaders.__repr__`` is patched with ``_repr_format`` for the
    duration of the call. The return value is whatever ``pprint.pprint``
    returns; the printed text is what the doctests compare.
    """
    rewrite = headerrewriter.rewrite
    original = StatusAndHeaders(status, headers)
    cookie_rewriter = rewriter.get_cookie_rewriter()
    rewritten = rewrite(original, rewriter, cookie_rewriter)
    return pprint.pprint(vars(rewritten))
def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
    """Rewrite ``headers`` via ``headerrewriter`` and return the status headers.

    :param headers: list of ``(name, value)`` header tuples
    :param status: status line used to build the ``StatusAndHeaders``
    :param rewriter: URL rewriter; also supplies the cookie rewriter
    """
    rewrite = headerrewriter.rewrite
    original = StatusAndHeaders(status, headers)
    cookie_rewriter = rewriter.get_cookie_rewriter()
    result = rewrite(original, rewriter, cookie_rewriter)
    return result.status_headers
def test_cookie_headers():
    """Check cookie path rewriting and prefixing of Connection/Host/Origin."""
    # cookie, host/origin rewriting
    original = [('Connection', 'close'),
                ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
                ('Host', 'example.com'),
                ('Origin', 'https://example.com')]
    res = _test_head_data(original)

    expected = [
        ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
        ('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html'),
        ('X-Archive-Orig-Connection', 'close'),
        ('X-Archive-Orig-Host', 'example.com'),
        ('X-Archive-Orig-Origin', 'https://example.com'),
    ]
    for header in expected:
        assert header in res.headers
def _make_cache_headers():
    """Return a fresh list of cache-related response headers.

    ``Expires`` is set to the current time formatted as an HTTP date.
    """
    return [('Content-Length', '123'),
            ('Cache-Control', 'max-age=10'),
            ('Expires', datetime_to_http_date(datetime.now())),
            ('ETag', '123456')]
def _test_proxy_headers(http_cache=None):
    """Rewrite the cache headers with the given ``http_cache`` rewrite option.

    :param http_cache: value stored under ``'http_cache'`` in ``rewrite_opts``
    :return: the ``status_headers`` attribute of the rewrite result
    """
    headers = _make_cache_headers()
    status = '200 OK'

    opts = {'http_cache': http_cache}
    rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                           rewrite_opts=opts)

    rewrite = headerrewriter.rewrite
    original = StatusAndHeaders(status, headers)
    cookie_rewriter = rewriter.get_cookie_rewriter()
    result = rewrite(original, rewriter, cookie_rewriter)
    return result.status_headers
def test_proxy_default():
    """No ``http_cache`` option: cache headers appear under ``X-Archive-Orig-``."""
    res = _test_proxy_headers()

    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('X-Archive-Orig-Cache-Control') is not None
    assert res.get_header('X-Archive-Orig-Expires') is not None
    assert res.get_header('X-Archive-Orig-ETag') is not None
def test_proxy_pass():
    """``http_cache='pass'``: the original cache headers are kept unchanged."""
    res = _test_proxy_headers('pass')

    assert res.get_header('Cache-Control') == 'max-age=10'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is not None
    assert res.get_header('ETag') is not None
def test_proxy_set_age():
    """``http_cache='600'``: max-age becomes 600, Expires is set, ETag dropped."""
    res = _test_proxy_headers('600')

    assert res.get_header('Cache-Control') == 'max-age=600'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is not None
    assert res.get_header('ETag') is None
def test_proxy_zero():
    """``http_cache='0'``: caching is disabled and Expires/ETag are absent."""
    res = _test_proxy_headers('0')

    assert res.get_header('Cache-Control') == 'no-cache; no-store'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is None
    assert res.get_header('ETag') is None
def test_proxy_not_num():
    """Non-numeric ``http_cache`` ('blah') is expected to behave like '0'."""
    res = _test_proxy_headers('blah')

    assert res.get_header('Cache-Control') == 'no-cache; no-store'
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert res.get_header('Expires') is None
    assert res.get_header('ETag') is None
# When executed as a script, run the doctests embedded in this module's
# docstring.
if __name__ == "__main__":
    import doctest
    doctest.testmod()