From 37dc4693c0e6a1f592e723f9856f70dada556bbf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 23 May 2017 23:56:44 -0700 Subject: [PATCH] tests: add new tests for header_rewriter default rewriter: using HostScopeCookieRewriter as default cookie rewriter, add 'cookie' entry to all_rewriters --- pywb/rewrite/content_rewriter.py | 14 +- pywb/rewrite/default_rewriter.py | 2 + pywb/rewrite/test/test_header_rewriter.py | 223 ++++++++++++++++++++++ tests_disabled/test_header_rewriter.py | 166 ---------------- 4 files changed, 231 insertions(+), 174 deletions(-) create mode 100644 pywb/rewrite/test/test_header_rewriter.py delete mode 100644 tests_disabled/test_header_rewriter.py diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 1a364ab9..efd915bc 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -10,7 +10,6 @@ import webencodings import tempfile from pywb.warcserver.utils import StreamIter, BUFF_SIZE -from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter from pywb.utils.loaders import load_yaml_config @@ -29,9 +28,6 @@ class BaseContentRewriter(object): def add_rewriter(self, rw): self.all_rewriters[rw.name] = rw - def get_rewriter(self, url, text_type): - return self.all_rewriters.get(text_type) - def load_rules(self, filename): config = load_yaml_config(filename) for rule in config.get('rules'): @@ -157,7 +153,7 @@ class BaseContentRewriter(object): head_insert_func=None, cdx=None): - rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter) + rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter) content_rewriter = None if rwinfo.should_rw_content(): @@ -259,13 +255,13 @@ class StreamingRewriter(object): class RewriteInfo(object): TAG_REGEX = re.compile(b'^\s*\<') - def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter): + def __init__(self, record, content_rewriter, url_rewriter, cookie_rewriter=None): self.record = record self._content_stream = None self.is_content_rw = False - self.rewrite_types = rewrite_types + self.rewrite_types = content_rewriter.get_rewrite_types() self.text_type = None self.charset = None @@ -273,7 +269,9 @@ class RewriteInfo(object): self.url_rewriter = url_rewriter if not cookie_rewriter: - cookie_rewriter = ExactPathCookieRewriter(url_rewriter) + cookie_rw_class = content_rewriter.all_rewriters.get('cookie') + if cookie_rw_class: + cookie_rewriter = cookie_rw_class(url_rewriter) self.cookie_rewriter = cookie_rewriter diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index 3496c401..6cd323f0 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -8,6 +8,7 @@ from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRe from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter from pywb.rewrite.header_rewriter import PrefixHeaderRewriter +from pywb.rewrite.cookie_rewriter import HostScopeCookieRewriter from pywb.rewrite.jsonp_rewriter import JSONPRewriter @@ -20,6 +21,7 @@ from pywb.rewrite.rewrite_amf import RewriteAMF class DefaultRewriter(BaseContentRewriter): all_rewriters = { 'header': PrefixHeaderRewriter, + 'cookie': HostScopeCookieRewriter, 'html': HTMLRewriter, 'html-banner-only': HTMLInsertOnlyRewriter, diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py new file mode 100644 index 00000000..c101b1f3 --- /dev/null +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -0,0 +1,223 @@ +from warcio.statusandheaders import StatusAndHeaders +from warcio.warcwriter import BufferWARCWriter +from warcio.timeutils import datetime_to_http_date + +from pywb.rewrite.content_rewriter import RewriteInfo +from pywb.rewrite.default_rewriter import DefaultRewriter +from pywb.rewrite.header_rewriter import PrefixHeaderRewriter +from pywb.rewrite.url_rewriter import UrlRewriter + +from datetime import datetime + +from io import BytesIO + + +class TestHeaderRewriter(object): + @classmethod + def setup_class(cls): + cls.urlrewriter = UrlRewriter('20171226/http://example.com/', '/warc/') + cls.default_rewriter = DefaultRewriter() + + @classmethod + def get_rwinfo(cls, record): + return RewriteInfo(record=record, + content_rewriter=cls.default_rewriter, + url_rewriter=cls.urlrewriter, cookie_rewriter=None) + + @classmethod + def do_rewrite(cls, statusline, headers): + writer = BufferWARCWriter() + + http_headers = StatusAndHeaders(statusline, headers, protocol='HTTP/1.0') + + record = writer.create_warc_record('http://example.com/', 'response', + http_headers=http_headers) + + return cls.get_rwinfo(record) + + def test_header_rewrite_200_response(self): + headers = [('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('Content-Length', '5'), + ('Content-Type', 'text/html;charset=UTF-8')] + + res = """\ +HTTP/1.0 200 OK\r\n\ +X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\ +Content-Length: 5\r\n\ +Content-Type: text/html;charset=UTF-8\r\n\ +""" + rwinfo = self.do_rewrite('200 OK', headers) + http_headers = PrefixHeaderRewriter(rwinfo)() + assert str(http_headers) == res + + assert rwinfo.text_type == 'html' + assert rwinfo.charset == 'utf-8' + + def test_header_rewrite_redirect(self): + headers = [('Connection', 'close'), + ('Location', 'http://example.com/other.html')] + + res = """\ +HTTP/1.0 302 Redirect\r\n\ +X-Archive-Orig-Connection: close\r\n\ +Location: /warc/20171226/http://example.com/other.html\r\n\ +""" + rwinfo = self.do_rewrite('302 Redirect', headers) + http_headers = PrefixHeaderRewriter(rwinfo)() + assert str(http_headers) == res + + assert rwinfo.text_type == None + assert rwinfo.charset == None + + def test_header_rewrite_gzipped(self): + headers = [('Content-Length', '199999'), + ('Content-Type', 'text/javascript'), + ('Content-Encoding', 'gzip'), + ('Transfer-Encoding', 'chunked')] + + rwinfo = self.do_rewrite('200 OK', headers) + + # Content-Encoding, Content-Length not yet rewritten + res = """\ +HTTP/1.0 200 OK\r\n\ +Content-Length: 199999\r\n\ +Content-Type: text/javascript\r\n\ +Content-Encoding: gzip\r\n\ +X-Archive-Orig-Transfer-Encoding: chunked\r\n\ +""" + http_headers = PrefixHeaderRewriter(rwinfo)() + assert str(http_headers) == res + + assert rwinfo.text_type == 'js' + assert rwinfo.charset == None + + # access stream + stream = rwinfo.content_stream + + # Content-Encoding, Content-Length rewritten now + res = """\ +HTTP/1.0 200 OK\r\n\ +X-Archive-Orig-Content-Length: 199999\r\n\ +Content-Type: text/javascript\r\n\ +X-Archive-Orig-Content-Encoding: gzip\r\n\ +X-Archive-Orig-Transfer-Encoding: chunked\r\n\ +""" + http_headers = PrefixHeaderRewriter(rwinfo)() + assert str(http_headers) == res + + def test_header_rewrite_binary(self): + headers = [('Content-Length', '200000'), + ('Content-Type', 'image/png'), + ('Set-Cookie', 'foo=bar; Path=/; abc=123; Path=/path.html'), + ('Content-Encoding', 'gzip'), + ('Transfer-Encoding', 'chunked'), + ('X-Custom', 'test')] + + res = """\ +HTTP/1.0 200 OK\r\n\ +Content-Length: 200000\r\n\ +Content-Type: image/png\r\n\ +Set-Cookie: foo=bar; Path=/warc/20171226/http://example.com/\r\n\ +Set-Cookie: abc=123; Path=/warc/20171226/http://example.com/path.html\r\n\ +Content-Encoding: gzip\r\n\ +X-Archive-Orig-Transfer-Encoding: chunked\r\n\ +X-Archive-Orig-X-Custom: test\r\n\ +""" + rwinfo = self.do_rewrite('200 OK', headers) + http_headers = PrefixHeaderRewriter(rwinfo)() + assert str(http_headers) == res + + assert rwinfo.text_type == None + assert rwinfo.charset == None + + + +def _test_head_data(headers, status='200 OK', rewriter=None): + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), + rewriter, + rewriter.get_cookie_rewriter()) + return rewritten.status_headers + + + +def _test_cookie_headers(): + # cookie, host/origin rewriting + res = _test_head_data([('Connection', 'close'), + ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'), + ('Host', 'example.com'), + ('Origin', 'https://example.com')]) + + assert(('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/') in res.headers) + assert(('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html') in res.headers) + + assert(('X-Archive-Orig-Connection', 'close') in res.headers) + assert(('X-Archive-Orig-Host', 'example.com') in res.headers) + assert(('X-Archive-Orig-Origin', 'https://example.com') in res.headers) + + + +def _make_cache_headers(): + cache_headers = [('Content-Length', '123'), + ('Cache-Control', 'max-age=10'), + ('Expires', datetime_to_http_date(datetime.now())), + ('ETag', '123456')] + return cache_headers + + +def _test_proxy_headers(http_cache=None): + headers = _make_cache_headers() + status = '200 OK' + rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/', + rewrite_opts={'http_cache': http_cache}) + + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), + rewriter, + rewriter.get_cookie_rewriter()) + return rewritten.status_headers + + +def _test_proxy_default(): + res = _test_proxy_headers() + + assert res.get_header('X-Archive-Orig-Cache-Control') != None + assert res.get_header('X-Archive-Orig-Expires') != None + assert res.get_header('X-Archive-Orig-ETag') != None + + +def _test_proxy_pass(): + res = _test_proxy_headers('pass') + + assert res.get_header('Cache-Control') == 'max-age=10' + assert res.get_header('Expires') != None + assert res.get_header('ETag') != None + + +def _test_proxy_set_age(): + res = _test_proxy_headers('600') + + assert res.get_header('Cache-Control') == 'max-age=600' + assert res.get_header('Expires') != None + assert res.get_header('ETag') == None + + +def _test_proxy_zero(): + res = _test_proxy_headers('0') + + assert res.get_header('Cache-Control') == 'no-cache; no-store' + assert res.get_header('Expires') == None + assert res.get_header('ETag') == None + + +def _test_proxy_not_num(): + res = _test_proxy_headers('blah') + + assert res.get_header('Cache-Control') == 'no-cache; no-store' + assert res.get_header('Expires') == None + assert res.get_header('ETag') == None + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/tests_disabled/test_header_rewriter.py b/tests_disabled/test_header_rewriter.py deleted file mode 100644 index 6e00b0c1..00000000 --- a/tests_disabled/test_header_rewriter.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -#================================================================= -HTTP Headers Rewriting -#================================================================= - -# Text with charset ->>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) -{'charset': 'utf-8', - 'removed_header_dict': {'content-length': '5'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), - ('X-Archive-Orig-Content-Length', '5'), - ('Content-Type', 'text/html;charset=UTF-8')]), - 'text_type': 'html'} - -# Redirect ->>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') -{'charset': None, - 'removed_header_dict': {}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), - ('Location', '/web/20131010/http://example.com/other.html')]), - 'text_type': None} - -# gzip ->>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) -{'charset': None, - 'removed_header_dict': {'content-encoding': 'gzip', - 'content-length': '199999', - 'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript'), - ('X-Archive-Orig-Content-Encoding', 'gzip'), - ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), - 'text_type': 'js'} - -# Binary -- transfer-encoding rewritten ->>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')]) -{'charset': None, - 'removed_header_dict': {'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), - ('Content-Type', 'image/png'), - ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), - ('Content-Encoding', 'gzip'), - ('X-Archive-Orig-Transfer-Encoding', 'chunked'), - ('X-Archive-Orig-X-Proxy', 'test')]), - 'text_type': None} - -""" - - - -from pywb.rewrite.header_rewriter import HeaderRewriter -from pywb.rewrite.url_rewriter import UrlRewriter -from warcio.statusandheaders import StatusAndHeaders - -from warcio.timeutils import datetime_to_http_date -from datetime import datetime - -import pprint -from mock import patch - -urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') - - -headerrewriter = HeaderRewriter() - -def _repr_format(sh): - headers_str = pprint.pformat(sh.headers, indent=2, width=80) - return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ -headers = {2})".format(sh.protocol, sh.statusline, headers_str) - - -@patch('warcio.statusandheaders.StatusAndHeaders.__repr__', _repr_format) -def _test_headers(headers, status='200 OK', rewriter=urlrewriter): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), rewriter, rewriter.get_cookie_rewriter()) - return pprint.pprint(vars(rewritten)) - - -def _test_head_data(headers, status='200 OK', rewriter=urlrewriter): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), - rewriter, - rewriter.get_cookie_rewriter()) - return rewritten.status_headers - - - -def test_cookie_headers(): - # cookie, host/origin rewriting - res = _test_head_data([('Connection', 'close'), - ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'), - ('Host', 'example.com'), - ('Origin', 'https://example.com')]) - - assert(('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/') in res.headers) - assert(('Set-Cookie', 'abc=def; Path=/web/20131010/http://example.com/somefile.html') in res.headers) - - assert(('X-Archive-Orig-Connection', 'close') in res.headers) - assert(('X-Archive-Orig-Host', 'example.com') in res.headers) - assert(('X-Archive-Orig-Origin', 'https://example.com') in res.headers) - - - -def _make_cache_headers(): - cache_headers = [('Content-Length', '123'), - ('Cache-Control', 'max-age=10'), - ('Expires', datetime_to_http_date(datetime.now())), - ('ETag', '123456')] - return cache_headers - - -def _test_proxy_headers(http_cache=None): - headers = _make_cache_headers() - status = '200 OK' - rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/', - rewrite_opts={'http_cache': http_cache}) - - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), - rewriter, - rewriter.get_cookie_rewriter()) - return rewritten.status_headers - - -def test_proxy_default(): - res = _test_proxy_headers() - - assert res.get_header('X-Archive-Orig-Cache-Control') != None - assert res.get_header('X-Archive-Orig-Expires') != None - assert res.get_header('X-Archive-Orig-ETag') != None - - -def test_proxy_pass(): - res = _test_proxy_headers('pass') - - assert res.get_header('Cache-Control') == 'max-age=10' - assert res.get_header('Expires') != None - assert res.get_header('ETag') != None - - -def test_proxy_set_age(): - res = _test_proxy_headers('600') - - assert res.get_header('Cache-Control') == 'max-age=600' - assert res.get_header('Expires') != None - assert res.get_header('ETag') == None - - -def test_proxy_zero(): - res = _test_proxy_headers('0') - - assert res.get_header('Cache-Control') == 'no-cache; no-store' - assert res.get_header('Expires') == None - assert res.get_header('ETag') == None - - -def test_proxy_not_num(): - res = _test_proxy_headers('blah') - - assert res.get_header('Cache-Control') == 'no-cache; no-store' - assert res.get_header('Expires') == None - assert res.get_header('ETag') == None - - -if __name__ == "__main__": - import doctest - doctest.testmod() - -