mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewriter: content-type detection improvements for the content rewriter:
- if content-type is missing, resolve the text type by checking for html and the modifier
- if the text type has changed, set the default JS and CSS content type
- if the text type is html, ensure the mime type is text/html (force xhtml mime type to text/html)
tests: add test_content_rewriter for direct header + content rewriting tests
This commit is contained in:
parent
aaad583276
commit
07229bafed
@ -276,9 +276,27 @@ class RewriteInfo(object):
|
||||
|
||||
self.cookie_rewriter = cookie_rewriter
|
||||
|
||||
if self.record:
|
||||
self._fill_text_type_and_charset()
|
||||
self._resolve_text_type()
|
||||
if not self.record:
|
||||
return
|
||||
|
||||
self._fill_text_type_and_charset()
|
||||
|
||||
orig_text_type = self.text_type
|
||||
|
||||
self._resolve_text_type()
|
||||
|
||||
if not self.text_type or (self.text_type != 'html' and self.text_type == orig_text_type):
|
||||
return
|
||||
|
||||
# text type changed, ensure content-type header matches
|
||||
content_type = content_rewriter.default_content_types.get(self.text_type)
|
||||
if not content_type:
|
||||
return
|
||||
|
||||
if self.charset:
|
||||
content_type += '; charset=' + self.charset
|
||||
|
||||
self.record.http_headers.replace_header('Content-Type', content_type)
|
||||
|
||||
def _fill_text_type_and_charset(self):
|
||||
content_type = self.record.http_headers.get_header('Content-Type')
|
||||
|
@ -79,6 +79,12 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
'text/plain': 'plain',
|
||||
}
|
||||
|
||||
default_content_types = {
|
||||
'html': 'text/html',
|
||||
'css': 'text/css',
|
||||
'js': 'text/javascript'
|
||||
}
|
||||
|
||||
def __init__(self, rules_file=None, replay_mod=''):
|
||||
rules_file = rules_file or 'pkg://pywb/rules.yaml'
|
||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||
|
109
pywb/rewrite/test/test_content_rewriter.py
Normal file
109
pywb/rewrite/test/test_content_rewriter.py
Normal file
@ -0,0 +1,109 @@
|
||||
from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        {'Content-Type': 'text/html'},
        {'Content-Type': 'application/xhtml+xml'},
        {},
    ],
    ids=['html', 'xhtml', 'none'],
)
def headers(request):
    """Yield the response headers for one parametrized run.

    Every test that requests this fixture runs three times: with an html
    Content-Type, with an xhtml Content-Type, and with no headers at all.
    """
    return request.param
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestContentRewriter(object):
    """Direct header + content rewriting tests for ``DefaultRewriter``.

    Each test builds an in-memory WARC response record, runs it through the
    rewriter and checks the resulting Content-Type header and body.
    """

    @classmethod
    def setup_class(cls):
        # A single rewriter instance is shared by every test in the class.
        cls.content_rewriter = DefaultRewriter()

    def _create_response_record(self, url, headers, payload):
        """Build a WARC 'response' record for ``url``.

        :param url: target url of the record
        :param headers: HTTP headers handed to ``StatusAndHeaders``
        :param payload: text body; encoded as UTF-8 before being stored
        :return: the record created by ``BufferWARCWriter.create_warc_record``
        """
        writer = BufferWARCWriter()

        payload = payload.encode('utf-8')

        http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')

        return writer.create_warc_record(url, 'response',
                                         payload=BytesIO(payload),
                                         length=len(payload),
                                         http_headers=http_headers)

    def rewrite_record(self, headers, content, url='http://example.com/',
                       ts='20170102030000000', prefix='http://localhost:8080/prefix/'):
        """Create a response record and pass it through the content rewriter.

        :param headers: HTTP headers of the original response
        :param content: text body of the original response
        :param url: url of the original response
        :param ts: prefix placed before ``url`` in the ``WbUrl``; the tests
                   pass a timestamp, optionally followed by a modifier
                   such as ``js_``, ``cs_`` or ``mp_``
        :param prefix: replay prefix given to the ``UrlRewriter``
        :return: the rewriter's result, unpacked by the tests as
                 ``(headers, body generator, is_rewritten)``
        """
        record = self._create_response_record(url, headers, content)

        wburl = WbUrl(ts + '/' + url)
        url_rewriter = UrlRewriter(wburl, prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)

        return self.content_rewriter(record, url_rewriter, None, cdx=cdx)

    def test_rewrite_html(self, headers):
        content = '<html><body><a href="http://example.com/"></a></body></html>'

        rw_headers, gen, is_rw = self.rewrite_record(headers, content)

        assert ('Content-Type', 'text/html') in rw_headers.headers

        exp = '<html><body><a href="http://localhost:8080/prefix/20170102030000000/http://example.com/"></a></body></html>'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_html_js_mod(self, headers):
        # html content requested with the js_ modifier is still expected
        # to be served and rewritten as html
        content = '<html><body><a href="http://example.com/"></a></body></html>'

        rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')

        assert ('Content-Type', 'text/html') in rw_headers.headers

        exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://example.com/"></a></body></html>'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_js_mod(self, headers):
        content = 'function() { location.href = "http://example.com/"; }'

        rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')

        assert ('Content-Type', 'text/javascript') in rw_headers.headers

        exp = 'function() { WB_wombat_location.href = "http://example.com/"; }'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_cs_mod(self, headers):
        # NOTE(review): the input url already carries the replay prefix, so
        # this only checks that the css is passed through unchanged -- confirm
        # whether an un-rewritten url was intended here.
        content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'

        rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701cs_')

        assert ('Content-Type', 'text/css') in rw_headers.headers

        exp = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'

        assert b''.join(gen).decode('utf-8') == exp

    def test_binary_no_content_type(self):
        headers = {}
        content = '\x11\x12\x13\x14'
        rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')

        # rw_headers.headers holds (name, value) tuples, so a bare
        # "'Content-Type' not in ..." check could never fail; compare the
        # header names instead.
        assert not any(name.lower() == 'content-type'
                       for name, _ in rw_headers.headers)

        assert is_rw is False
|
||||
|
||||
|
||||
|
@ -44,7 +44,7 @@ class TestHeaderRewriter(object):
|
||||
HTTP/1.0 200 OK\r\n\
|
||||
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
||||
Content-Length: 5\r\n\
|
||||
Content-Type: text/html;charset=UTF-8\r\n\
|
||||
Content-Type: text/html; charset=utf-8\r\n\
|
||||
"""
|
||||
rwinfo = self.do_rewrite('200 OK', headers)
|
||||
http_headers = PrefixHeaderRewriter(rwinfo)()
|
||||
|
Loading…
x
Reference in New Issue
Block a user