diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py
index 2d59665d..f774c7ac 100644
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@@ -276,9 +276,32 @@ class RewriteInfo(object):
 
         self.cookie_rewriter = cookie_rewriter
 
-        if self.record:
-            self._fill_text_type_and_charset()
-            self._resolve_text_type()
+        if not self.record:
+            return
+
+        self._fill_text_type_and_charset()
+
+        # text type as set by _fill_text_type_and_charset(), before resolution
+        orig_text_type = self.text_type
+
+        self._resolve_text_type()
+
+        # leave the header alone when there is no text type, or when a
+        # non-html type was left unchanged (html always gets a new header)
+        if not self.text_type or (self.text_type != 'html' and self.text_type == orig_text_type):
+            return
+
+        # text type changed, ensure content-type header matches
+        # (a rewriter that defines no default_content_types keeps its header)
+        content_types = getattr(content_rewriter, 'default_content_types', {})
+        content_type = content_types.get(self.text_type)
+        if not content_type:
+            return
+
+        if self.charset:
+            content_type += '; charset=' + self.charset
+
+        self.record.http_headers.replace_header('Content-Type', content_type)
 
     def _fill_text_type_and_charset(self):
         content_type = self.record.http_headers.get_header('Content-Type')
diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py
index fc7bdeee..7b1873b6 100644
--- a/pywb/rewrite/default_rewriter.py
+++ b/pywb/rewrite/default_rewriter.py
@@ -79,6 +79,13 @@ class DefaultRewriter(BaseContentRewriter):
         'text/plain': 'plain',
     }
 
+    # Content-Type header value to use for each resolved text type
+    default_content_types = {
+        'html': 'text/html',
+        'css': 'text/css',
+        'js': 'text/javascript',
+    }
+
     def __init__(self, rules_file=None, replay_mod=''):
         rules_file = rules_file or 'pkg://pywb/rules.yaml'
         super(DefaultRewriter, self).__init__(rules_file, replay_mod)
diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py
new file mode 100644
index 00000000..c7a7d239
--- /dev/null
+++ b/pywb/rewrite/test/test_content_rewriter.py
@@ -0,0 +1,126 @@
+from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
+from warcio.statusandheaders import StatusAndHeaders
+
+from io import BytesIO
+
+from pywb.warcserver.index.cdxobject import CDXObject
+from pywb.utils.canonicalize import canonicalize
+
+from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.default_rewriter import DefaultRewriter
+
+import pytest
+
+
+@pytest.fixture(params=[{'Content-Type': 'text/html'},
+                        {'Content-Type': 'application/xhtml+xml'},
+                        {}],
+                ids=['html', 'xhtml', 'none'])
+def headers(request):
+    """Original response headers: html, xhtml or no Content-Type at all."""
+    # hand out a copy so no test can alter the shared parameter dicts
+    return dict(request.param)
+
+
+# ============================================================================
+class TestContentRewriter(object):
+    """Check the Content-Type header and body returned by DefaultRewriter."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.content_rewriter = DefaultRewriter()
+
+    def _create_response_record(self, url, headers, payload):
+        """Build a '200 OK' WARC response record with a utf-8 encoded payload."""
+        writer = BufferWARCWriter()
+
+        payload = payload.encode('utf-8')
+
+        http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
+
+        return writer.create_warc_record(url, 'response',
+                                         payload=BytesIO(payload),
+                                         length=len(payload),
+                                         http_headers=http_headers)
+
+    def rewrite_record(self, headers, content, url='http://example.com/',
+                       ts='20170102030000000', prefix='http://localhost:8080/prefix/'):
+        """Rewrite a response built from headers and content.
+
+        ts is prepended to url to form the wburl, so it may end in a
+        modifier such as 'js_', 'cs_' or 'mp_'.  Returns whatever the
+        content rewriter returns (unpacked by the tests as headers,
+        body generator and is-rewritten flag).
+        """
+
+        record = self._create_response_record(url, headers, content)
+
+        wburl = WbUrl(ts + '/' + url)
+        url_rewriter = UrlRewriter(wburl, prefix)
+
+        cdx = CDXObject()
+        cdx['url'] = url
+        cdx['timestamp'] = ts
+        cdx['urlkey'] = canonicalize(url)
+
+        return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
+
+    def test_rewrite_html(self, headers):
+        """Without a modifier the response is served as text/html."""
+        # NOTE(review): content/exp are empty strings; confirm no markup was lost
+        content = ''
+
+        headers, gen, is_rw = self.rewrite_record(headers, content)
+
+        assert ('Content-Type', 'text/html') in headers.headers
+
+        exp = ''
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_html_js_mod(self, headers):
+        """With the js_ modifier this content is still served as text/html."""
+        # NOTE(review): content/exp are empty strings; confirm no markup was lost
+        content = ''
+
+        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')
+
+        assert ('Content-Type', 'text/html') in headers.headers
+
+        exp = ''
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_js_mod(self, headers):
+        """With the js_ modifier a script is served as text/javascript."""
+        content = 'function() { location.href = "http://example.com/"; }'
+
+        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')
+
+        assert ('Content-Type', 'text/javascript') in headers.headers
+
+        exp = 'function() { WB_wombat_location.href = "http://example.com/"; }'
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_cs_mod(self, headers):
+        """With the cs_ modifier a stylesheet is served as text/css."""
+        content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
+
+        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701cs_')
+
+        assert ('Content-Type', 'text/css') in headers.headers
+
+        exp = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
+
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_binary_no_content_type(self):
+        """Binary data without a Content-Type gets no header and no rewrite."""
+        headers = {}
+        content = '\x11\x12\x13\x14'
+        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
+
+        # headers.headers holds (name, value) tuples, so testing the bare
+        # name for membership in that list could never fail
+        assert all(name.lower() != 'content-type' for name, _ in headers.headers)
+
+        assert is_rw is False
diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py
index 114b70c7..a2c20743 100644
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@@ -44,7 +44,7 @@ class TestHeaderRewriter(object):
 HTTP/1.0 200 OK\r\n\
 X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
 Content-Length: 5\r\n\
-Content-Type: text/html;charset=UTF-8\r\n\
+Content-Type: text/html; charset=utf-8\r\n\
 """
         rwinfo = self.do_rewrite('200 OK', headers)
         http_headers = PrefixHeaderRewriter(rwinfo)()