mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
tests: add new tests for header_rewriter
default rewriter: using HostScopeCookieRewriter as default cookie rewriter, add 'cookie' entry to all_rewriters
This commit is contained in:
parent
97182b71b7
commit
37dc4693c0
@ -10,7 +10,6 @@ import webencodings
|
||||
import tempfile
|
||||
|
||||
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
|
||||
@ -29,9 +28,6 @@ class BaseContentRewriter(object):
|
||||
def add_rewriter(self, rw):
|
||||
self.all_rewriters[rw.name] = rw
|
||||
|
||||
def get_rewriter(self, url, text_type):
|
||||
return self.all_rewriters.get(text_type)
|
||||
|
||||
def load_rules(self, filename):
|
||||
config = load_yaml_config(filename)
|
||||
for rule in config.get('rules'):
|
||||
@ -157,7 +153,7 @@ class BaseContentRewriter(object):
|
||||
head_insert_func=None,
|
||||
cdx=None):
|
||||
|
||||
rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
|
||||
rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
|
||||
content_rewriter = None
|
||||
|
||||
if rwinfo.should_rw_content():
|
||||
@ -259,13 +255,13 @@ class StreamingRewriter(object):
|
||||
class RewriteInfo(object):
|
||||
TAG_REGEX = re.compile(b'^\s*\<')
|
||||
|
||||
def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
|
||||
def __init__(self, record, content_rewriter, url_rewriter, cookie_rewriter=None):
|
||||
self.record = record
|
||||
|
||||
self._content_stream = None
|
||||
self.is_content_rw = False
|
||||
|
||||
self.rewrite_types = rewrite_types
|
||||
self.rewrite_types = content_rewriter.get_rewrite_types()
|
||||
|
||||
self.text_type = None
|
||||
self.charset = None
|
||||
@ -273,7 +269,9 @@ class RewriteInfo(object):
|
||||
self.url_rewriter = url_rewriter
|
||||
|
||||
if not cookie_rewriter:
|
||||
cookie_rewriter = ExactPathCookieRewriter(url_rewriter)
|
||||
cookie_rw_class = content_rewriter.all_rewriters.get('cookie')
|
||||
if cookie_rw_class:
|
||||
cookie_rewriter = cookie_rw_class(url_rewriter)
|
||||
|
||||
self.cookie_rewriter = cookie_rewriter
|
||||
|
||||
|
@ -8,6 +8,7 @@ from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRe
|
||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||
|
||||
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
|
||||
from pywb.rewrite.cookie_rewriter import HostScopeCookieRewriter
|
||||
|
||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||
|
||||
@ -20,6 +21,7 @@ from pywb.rewrite.rewrite_amf import RewriteAMF
|
||||
class DefaultRewriter(BaseContentRewriter):
|
||||
all_rewriters = {
|
||||
'header': PrefixHeaderRewriter,
|
||||
'cookie': HostScopeCookieRewriter,
|
||||
|
||||
'html': HTMLRewriter,
|
||||
'html-banner-only': HTMLInsertOnlyRewriter,
|
||||
|
223
pywb/rewrite/test/test_header_rewriter.py
Normal file
223
pywb/rewrite/test/test_header_rewriter.py
Normal file
@ -0,0 +1,223 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.warcwriter import BufferWARCWriter
|
||||
from warcio.timeutils import datetime_to_http_date
|
||||
|
||||
from pywb.rewrite.content_rewriter import RewriteInfo
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||
from pywb.rewrite.header_rewriter import PrefixHeaderRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class TestHeaderRewriter(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
cls.urlrewriter = UrlRewriter('20171226/http://example.com/', '/warc/')
|
||||
cls.default_rewriter = DefaultRewriter()
|
||||
|
||||
@classmethod
|
||||
def get_rwinfo(cls, record):
|
||||
return RewriteInfo(record=record,
|
||||
content_rewriter=cls.default_rewriter,
|
||||
url_rewriter=cls.urlrewriter, cookie_rewriter=None)
|
||||
|
||||
@classmethod
|
||||
def do_rewrite(cls, statusline, headers):
|
||||
writer = BufferWARCWriter()
|
||||
|
||||
http_headers = StatusAndHeaders(statusline, headers, protocol='HTTP/1.0')
|
||||
|
||||
record = writer.create_warc_record('http://example.com/', 'response',
|
||||
http_headers=http_headers)
|
||||
|
||||
return cls.get_rwinfo(record)
|
||||
|
||||
def test_header_rewrite_200_response(self):
|
||||
headers = [('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||
('Content-Length', '5'),
|
||||
('Content-Type', 'text/html;charset=UTF-8')]
|
||||
|
||||
res = """\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
||||
Content-Length: 5\r\n\
|
||||
Content-Type: text/html;charset=UTF-8\r\n\
|
||||
"""
|
||||
rwinfo = self.do_rewrite('200 OK', headers)
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
assert str(http_headers) == res
|
||||
|
||||
assert rwinfo.text_type == 'html'
|
||||
assert rwinfo.charset == 'utf-8'
|
||||
|
||||
def test_header_rewrite_redirect(self):
|
||||
headers = [('Connection', 'close'),
|
||||
('Location', 'http://example.com/other.html')]
|
||||
|
||||
res = """\
|
||||
HTTP/1.0 302 Redirect\r\n\
|
||||
X-Archive-Orig-Connection: close\r\n\
|
||||
Location: /warc/20171226/http://example.com/other.html\r\n\
|
||||
"""
|
||||
rwinfo = self.do_rewrite('302 Redirect', headers)
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
assert str(http_headers) == res
|
||||
|
||||
assert rwinfo.text_type == None
|
||||
assert rwinfo.charset == None
|
||||
|
||||
def test_header_rewrite_gzipped(self):
|
||||
headers = [('Content-Length', '199999'),
|
||||
('Content-Type', 'text/javascript'),
|
||||
('Content-Encoding', 'gzip'),
|
||||
('Transfer-Encoding', 'chunked')]
|
||||
|
||||
rwinfo = self.do_rewrite('200 OK', headers)
|
||||
|
||||
# Content-Encoding, Content-Length not yet rewritten
|
||||
res = """\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
Content-Length: 199999\r\n\
|
||||
Content-Type: text/javascript\r\n\
|
||||
Content-Encoding: gzip\r\n\
|
||||
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
|
||||
"""
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
assert str(http_headers) == res
|
||||
|
||||
assert rwinfo.text_type == 'js'
|
||||
assert rwinfo.charset == None
|
||||
|
||||
# access stream
|
||||
stream = rwinfo.content_stream
|
||||
|
||||
# Content-Encoding, Content-Length rewritten now
|
||||
res = """\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
X-Archive-Orig-Content-Length: 199999\r\n\
|
||||
Content-Type: text/javascript\r\n\
|
||||
X-Archive-Orig-Content-Encoding: gzip\r\n\
|
||||
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
|
||||
"""
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
assert str(http_headers) == res
|
||||
|
||||
def test_header_rewrite_binary(self):
|
||||
headers = [('Content-Length', '200000'),
|
||||
('Content-Type', 'image/png'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=123; Path=/path.html'),
|
||||
('Content-Encoding', 'gzip'),
|
||||
('Transfer-Encoding', 'chunked'),
|
||||
('X-Custom', 'test')]
|
||||
|
||||
res = """\
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
Content-Length: 200000\r\n\
|
||||
Content-Type: image/png\r\n\
|
||||
Set-Cookie: foo=bar; Path=/warc/20171226/http://example.com/\r\n\
|
||||
Set-Cookie: abc=123; Path=/warc/20171226/http://example.com/path.html\r\n\
|
||||
Content-Encoding: gzip\r\n\
|
||||
X-Archive-Orig-Transfer-Encoding: chunked\r\n\
|
||||
X-Archive-Orig-X-Custom: test\r\n\
|
||||
"""
|
||||
rwinfo = self.do_rewrite('200 OK', headers)
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
assert str(http_headers) == res
|
||||
|
||||
assert rwinfo.text_type == None
|
||||
assert rwinfo.charset == None
|
||||
|
||||
|
||||
|
||||
def _test_head_data(headers, status='200 OK', rewriter=None):
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
|
||||
rewriter,
|
||||
rewriter.get_cookie_rewriter())
|
||||
return rewritten.status_headers
|
||||
|
||||
|
||||
|
||||
def _test_cookie_headers():
|
||||
# cookie, host/origin rewriting
|
||||
res = _test_head_data([('Connection', 'close'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
|
||||
('Host', 'example.com'),
|
||||
('Origin', 'https://example.com')])
|
||||
|
||||
assert(('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/') in res.headers)
|
||||
assert(('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html') in res.headers)
|
||||
|
||||
assert(('X-Archive-Orig-Connection', 'close') in res.headers)
|
||||
assert(('X-Archive-Orig-Host', 'example.com') in res.headers)
|
||||
assert(('X-Archive-Orig-Origin', 'https://example.com') in res.headers)
|
||||
|
||||
|
||||
|
||||
def _make_cache_headers():
|
||||
cache_headers = [('Content-Length', '123'),
|
||||
('Cache-Control', 'max-age=10'),
|
||||
('Expires', datetime_to_http_date(datetime.now())),
|
||||
('ETag', '123456')]
|
||||
return cache_headers
|
||||
|
||||
|
||||
def _test_proxy_headers(http_cache=None):
|
||||
headers = _make_cache_headers()
|
||||
status = '200 OK'
|
||||
rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
|
||||
rewrite_opts={'http_cache': http_cache})
|
||||
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
|
||||
rewriter,
|
||||
rewriter.get_cookie_rewriter())
|
||||
return rewritten.status_headers
|
||||
|
||||
|
||||
def _test_proxy_default():
|
||||
res = _test_proxy_headers()
|
||||
|
||||
assert res.get_header('X-Archive-Orig-Cache-Control') != None
|
||||
assert res.get_header('X-Archive-Orig-Expires') != None
|
||||
assert res.get_header('X-Archive-Orig-ETag') != None
|
||||
|
||||
|
||||
def _test_proxy_pass():
|
||||
res = _test_proxy_headers('pass')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'max-age=10'
|
||||
assert res.get_header('Expires') != None
|
||||
assert res.get_header('ETag') != None
|
||||
|
||||
|
||||
def _test_proxy_set_age():
|
||||
res = _test_proxy_headers('600')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'max-age=600'
|
||||
assert res.get_header('Expires') != None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
def _test_proxy_zero():
|
||||
res = _test_proxy_headers('0')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'no-cache; no-store'
|
||||
assert res.get_header('Expires') == None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
def _test_proxy_not_num():
|
||||
res = _test_proxy_headers('blah')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'no-cache; no-store'
|
||||
assert res.get_header('Expires') == None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
@ -1,166 +0,0 @@
|
||||
"""
|
||||
#=================================================================
|
||||
HTTP Headers Rewriting
|
||||
#=================================================================
|
||||
|
||||
# Text with charset
|
||||
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
||||
{'charset': 'utf-8',
|
||||
'removed_header_dict': {'content-length': '5'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||
('X-Archive-Orig-Content-Length', '5'),
|
||||
('Content-Type', 'text/html;charset=UTF-8')]),
|
||||
'text_type': 'html'}
|
||||
|
||||
# Redirect
|
||||
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Location', '/web/20131010/http://example.com/other.html')]),
|
||||
'text_type': None}
|
||||
|
||||
# gzip
|
||||
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'content-encoding': 'gzip',
|
||||
'content-length': '199999',
|
||||
'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
||||
('Content-Type', 'text/javascript'),
|
||||
('X-Archive-Orig-Content-Encoding', 'gzip'),
|
||||
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
|
||||
'text_type': 'js'}
|
||||
|
||||
# Binary -- transfer-encoding rewritten
|
||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||
('Content-Type', 'image/png'),
|
||||
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||
('Content-Encoding', 'gzip'),
|
||||
('X-Archive-Orig-Transfer-Encoding', 'chunked'),
|
||||
('X-Archive-Orig-X-Proxy', 'test')]),
|
||||
'text_type': None}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from warcio.timeutils import datetime_to_http_date
|
||||
from datetime import datetime
|
||||
|
||||
import pprint
|
||||
from mock import patch
|
||||
|
||||
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||
|
||||
|
||||
headerrewriter = HeaderRewriter()
|
||||
|
||||
def _repr_format(sh):
|
||||
headers_str = pprint.pformat(sh.headers, indent=2, width=80)
|
||||
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
|
||||
headers = {2})".format(sh.protocol, sh.statusline, headers_str)
|
||||
|
||||
|
||||
@patch('warcio.statusandheaders.StatusAndHeaders.__repr__', _repr_format)
|
||||
def _test_headers(headers, status='200 OK', rewriter=urlrewriter):
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), rewriter, rewriter.get_cookie_rewriter())
|
||||
return pprint.pprint(vars(rewritten))
|
||||
|
||||
|
||||
def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
|
||||
rewriter,
|
||||
rewriter.get_cookie_rewriter())
|
||||
return rewritten.status_headers
|
||||
|
||||
|
||||
|
||||
def test_cookie_headers():
|
||||
# cookie, host/origin rewriting
|
||||
res = _test_head_data([('Connection', 'close'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
|
||||
('Host', 'example.com'),
|
||||
('Origin', 'https://example.com')])
|
||||
|
||||
assert(('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/') in res.headers)
|
||||
assert(('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html') in res.headers)
|
||||
|
||||
assert(('X-Archive-Orig-Connection', 'close') in res.headers)
|
||||
assert(('X-Archive-Orig-Host', 'example.com') in res.headers)
|
||||
assert(('X-Archive-Orig-Origin', 'https://example.com') in res.headers)
|
||||
|
||||
|
||||
|
||||
def _make_cache_headers():
|
||||
cache_headers = [('Content-Length', '123'),
|
||||
('Cache-Control', 'max-age=10'),
|
||||
('Expires', datetime_to_http_date(datetime.now())),
|
||||
('ETag', '123456')]
|
||||
return cache_headers
|
||||
|
||||
|
||||
def _test_proxy_headers(http_cache=None):
|
||||
headers = _make_cache_headers()
|
||||
status = '200 OK'
|
||||
rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
|
||||
rewrite_opts={'http_cache': http_cache})
|
||||
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
|
||||
rewriter,
|
||||
rewriter.get_cookie_rewriter())
|
||||
return rewritten.status_headers
|
||||
|
||||
|
||||
def test_proxy_default():
|
||||
res = _test_proxy_headers()
|
||||
|
||||
assert res.get_header('X-Archive-Orig-Cache-Control') != None
|
||||
assert res.get_header('X-Archive-Orig-Expires') != None
|
||||
assert res.get_header('X-Archive-Orig-ETag') != None
|
||||
|
||||
|
||||
def test_proxy_pass():
|
||||
res = _test_proxy_headers('pass')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'max-age=10'
|
||||
assert res.get_header('Expires') != None
|
||||
assert res.get_header('ETag') != None
|
||||
|
||||
|
||||
def test_proxy_set_age():
|
||||
res = _test_proxy_headers('600')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'max-age=600'
|
||||
assert res.get_header('Expires') != None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
def test_proxy_zero():
|
||||
res = _test_proxy_headers('0')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'no-cache; no-store'
|
||||
assert res.get_header('Expires') == None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
def test_proxy_not_num():
|
||||
res = _test_proxy_headers('blah')
|
||||
|
||||
assert res.get_header('Cache-Control') == 'no-cache; no-store'
|
||||
assert res.get_header('Expires') == None
|
||||
assert res.get_header('ETag') == None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user