rewriter: content rewriter content-type detection improvements:

- if the content-type is missing, resolve whether the record is a text type by checking for html and the modifier
- if the text type has changed, set the default content type for JS and CSS
- if the text type is html, ensure the mime type is text/html (force the xhtml mime type to text/html)

tests: add test_content_rewriter for direct header + content rewriting tests
2025-03-15 00:03:28 +01:00 · 2017-08-16 23:01:59 -07:00 · 2017-08-16 23:01:59 -07:00 · 07229bafed
commit 07229bafed
parent aaad583276
4 changed files with 137 additions and 4 deletions
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@ -276,9 +276,27 @@ class RewriteInfo(object):

        self.cookie_rewriter = cookie_rewriter

-        if self.record:
-            self._fill_text_type_and_charset()
-            self._resolve_text_type()
+        if not self.record:
+            return
+
+        self._fill_text_type_and_charset()
+
+        orig_text_type = self.text_type
+
+        self._resolve_text_type()
+
+        if not self.text_type or (self.text_type != 'html' and self.text_type == orig_text_type):
+            return
+
+        # text type changed, ensure content-type header matches
+        content_type = content_rewriter.default_content_types.get(self.text_type)
+        if not content_type:
+            return
+
+        if self.charset:
+           content_type += '; charset=' + self.charset
+
+        self.record.http_headers.replace_header('Content-Type', content_type)

    def _fill_text_type_and_charset(self):
        content_type = self.record.http_headers.get_header('Content-Type')
--- a/pywb/rewrite/default_rewriter.py
+++ b/pywb/rewrite/default_rewriter.py
@ -79,6 +79,12 @@ class DefaultRewriter(BaseContentRewriter):
        'text/plain': 'plain',
    }

+    default_content_types = {
+        'html': 'text/html',
+        'css': 'text/css',
+        'js': 'text/javascript'
+    }
+
    def __init__(self, rules_file=None, replay_mod=''):
        rules_file = rules_file or 'pkg://pywb/rules.yaml'
        super(DefaultRewriter, self).__init__(rules_file, replay_mod)
--- a/pywb/rewrite/test/test_content_rewriter.py
+++ b/pywb/rewrite/test/test_content_rewriter.py
@ -0,0 +1,109 @@
+from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
+from warcio.statusandheaders import StatusAndHeaders
+
+from io import BytesIO
+
+from pywb.warcserver.index.cdxobject import CDXObject
+from pywb.utils.canonicalize import canonicalize
+
+from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.default_rewriter import DefaultRewriter
+
+import pytest
+
+
+@pytest.fixture(params=[{'Content-Type': 'text/html'},
+                        {'Content-Type': 'application/xhtml+xml'},
+                        {}],
+                ids=['html', 'xhtml', 'none'])
+def headers(request):
+    """Provide the HTTP response headers for each parametrized case.
+
+    The cases are: an html content type, an xhtml content type, and no
+    headers at all.
+    """
+    # Hand out a copy: the param dicts are created once and reused by every
+    # test that requests this fixture, so no test should be able to alter them.
+    return dict(request.param)
+
+
+# ============================================================================
+class TestContentRewriter(object):
+    """Direct header + content rewriting tests for DefaultRewriter.
+
+    Each test builds an in-memory WARC response record, passes it through
+    the rewriter and checks the resulting Content-Type header and body.
+    """
+
+    @classmethod
+    def setup_class(cls):
+        # a single rewriter instance is stored on the class for all tests
+        cls.content_rewriter = DefaultRewriter()
+
+    def _create_response_record(self, url, headers, payload):
+        """Create a 'response' WARC record for ``url``.
+
+        ``headers`` are used as the HTTP headers of a '200 OK' HTTP/1.0
+        response; ``payload`` is text, encoded as UTF-8 to form the body.
+        """
+        writer = BufferWARCWriter()
+
+        payload = payload.encode('utf-8')
+
+        http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
+
+        return writer.create_warc_record(url, 'response',
+                                         payload=BytesIO(payload),
+                                         length=len(payload),
+                                         http_headers=http_headers)
+
+    def rewrite_record(self, headers, content, url='http://example.com/',
+                       ts='20170102030000000', prefix='http://localhost:8080/prefix/'):
+        """Run ``content`` with ``headers`` through the content rewriter.
+
+        ``ts`` is the timestamp part of the wayback url and may carry a
+        modifier suffix (the tests use 'js_', 'cs_' and 'mp_'). Returns the
+        rewriter's result, which the tests unpack as
+        ``(headers, gen, is_rw)``.
+        """
+        record = self._create_response_record(url, headers, content)
+
+        wburl = WbUrl(ts + '/' + url)
+        url_rewriter = UrlRewriter(wburl, prefix)
+
+        cdx = CDXObject()
+        cdx['url'] = url
+        cdx['timestamp'] = ts
+        cdx['urlkey'] = canonicalize(url)
+
+        return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
+
+    def test_rewrite_html(self, headers):
+        content = '<html><body><a href="http://example.com/"></a></body></html>'
+
+        out_headers, gen, is_rw = self.rewrite_record(headers, content)
+
+        # html, xhtml and missing content types should all end up as text/html
+        assert ('Content-Type', 'text/html') in out_headers.headers
+
+        exp = '<html><body><a href="http://localhost:8080/prefix/20170102030000000/http://example.com/"></a></body></html>'
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_html_js_mod(self, headers):
+        content = '<html><body><a href="http://example.com/"></a></body></html>'
+
+        out_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')
+
+        # html content requested with the js_ modifier is still expected
+        # to be served and rewritten as html
+        assert ('Content-Type', 'text/html') in out_headers.headers
+
+        exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://example.com/"></a></body></html>'
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_js_mod(self, headers):
+        content = 'function() { location.href = "http://example.com/"; }'
+
+        out_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')
+
+        assert ('Content-Type', 'text/javascript') in out_headers.headers
+
+        exp = 'function() { WB_wombat_location.href = "http://example.com/"; }'
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_rewrite_cs_mod(self, headers):
+        content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
+
+        out_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701cs_')
+
+        assert ('Content-Type', 'text/css') in out_headers.headers
+
+        # the url in the input is already under the prefix, so the expected
+        # output is identical to the input
+        exp = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
+
+        assert b''.join(gen).decode('utf-8') == exp
+
+    def test_binary_no_content_type(self):
+        headers = {}
+        content = '\x11\x12\x13\x14'
+        out_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
+
+        # .headers holds (name, value) tuples, so the header names must be
+        # compared; testing a bare string for membership is always true
+        assert all(name.lower() != 'content-type'
+                   for name, value in out_headers.headers)
+
+        assert not is_rw
+
+
+
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@ -44,7 +44,7 @@ class TestHeaderRewriter(object):
 HTTP/1.0 200 OK\r\n\
 X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
 Content-Length: 5\r\n\
-Content-Type: text/html;charset=UTF-8\r\n\
+Content-Type: text/html; charset=utf-8\r\n\
 """
        rwinfo = self.do_rewrite('200 OK', headers)
        http_headers = PrefixHeaderRewriter(rwinfo)()