1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewriter: content rewriter content-type detection improvements:

- if content-type is missing, resolve whether it is a text type by checking for html and the modifier
- if text type has changed, set default JS and CSS text type
- if text type is html, ensure mime type is text/html (force xhtml mime type to text/html)
tests: add test_content_rewriter for direct header + content rewriting tests
This commit is contained in:
Ilya Kreymer 2017-08-16 23:01:59 -07:00 committed by Ilya Kreymer
parent aaad583276
commit 07229bafed
4 changed files with 137 additions and 4 deletions

View File

@ -276,9 +276,27 @@ class RewriteInfo(object):
self.cookie_rewriter = cookie_rewriter
if self.record:
self._fill_text_type_and_charset()
self._resolve_text_type()
if not self.record:
return
self._fill_text_type_and_charset()
orig_text_type = self.text_type
self._resolve_text_type()
if not self.text_type or (self.text_type != 'html' and self.text_type == orig_text_type):
return
# text type changed, ensure content-type header matches
content_type = content_rewriter.default_content_types.get(self.text_type)
if not content_type:
return
if self.charset:
content_type += '; charset=' + self.charset
self.record.http_headers.replace_header('Content-Type', content_type)
def _fill_text_type_and_charset(self):
content_type = self.record.http_headers.get_header('Content-Type')

View File

@ -79,6 +79,12 @@ class DefaultRewriter(BaseContentRewriter):
'text/plain': 'plain',
}
# Content-Type value to emit for each text type; looked up by text type
# when the Content-Type header has to be replaced.
default_content_types = {
    'html': 'text/html',
    'css': 'text/css',
    'js': 'text/javascript',
}
def __init__(self, rules_file=None, replay_mod=''):
rules_file = rules_file or 'pkg://pywb/rules.yaml'
super(DefaultRewriter, self).__init__(rules_file, replay_mod)

View File

@ -0,0 +1,109 @@
from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
from warcio.statusandheaders import StatusAndHeaders
from io import BytesIO
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter
import pytest
@pytest.fixture(
    params=[
        {'Content-Type': 'text/html'},
        {'Content-Type': 'application/xhtml+xml'},
        {},
    ],
    ids=['html', 'xhtml', 'none'],
)
def headers(request):
    """Supply the response headers for the current parametrized run.

    Each test using this fixture runs three times: with an html
    content type, with an xhtml content type, and with no headers.
    """
    return request.param
# ============================================================================
class TestContentRewriter(object):
    """Direct header + content rewriting tests run against ``DefaultRewriter``."""

    @classmethod
    def setup_class(cls):
        """Create the ``DefaultRewriter`` shared by all tests in this class."""
        cls.content_rewriter = DefaultRewriter()

    def _create_response_record(self, url, headers, payload):
        """Build a WARC ``response`` record for ``url``.

        :param url: target url of the record
        :param headers: HTTP headers handed to ``StatusAndHeaders``
        :param payload: text body; encoded as UTF-8 before being written
        :return: the record produced by ``BufferWARCWriter.create_warc_record``
        """
        writer = BufferWARCWriter()

        payload = payload.encode('utf-8')

        http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')

        return writer.create_warc_record(url, 'response',
                                         payload=BytesIO(payload),
                                         length=len(payload),
                                         http_headers=http_headers)

    def rewrite_record(self, headers, content, url='http://example.com/',
                       ts='20170102030000000',
                       prefix='http://localhost:8080/prefix/'):
        """Wrap ``content`` in a response record and run it through the rewriter.

        :param headers: HTTP headers for the response record
        :param content: text payload of the response
        :param url: url the record (and cdx entry) is created for
        :param ts: timestamp, optionally followed by a modifier such as
                   ``js_``, used to build the ``WbUrl``
        :param prefix: prefix passed to ``UrlRewriter``
        :return: whatever the content rewriter returns; the tests unpack it
                 as ``(http_headers, content_iterator, is_rewritten)``
        """
        record = self._create_response_record(url, headers, content)

        wburl = WbUrl(ts + '/' + url)
        url_rewriter = UrlRewriter(wburl, prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)

        return self.content_rewriter(record, url_rewriter, None, cdx=cdx)

    def test_rewrite_html(self, headers):
        """HTML content gets a text/html Content-Type and rewritten links."""
        content = '<html><body><a href="http://example.com/"></a></body></html>'

        http_headers, gen, is_rw = self.rewrite_record(headers, content)

        assert ('Content-Type', 'text/html') in http_headers.headers

        exp = '<html><body><a href="http://localhost:8080/prefix/20170102030000000/http://example.com/"></a></body></html>'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_html_js_mod(self, headers):
        """HTML content requested with the ``js_`` modifier is still rewritten as HTML."""
        content = '<html><body><a href="http://example.com/"></a></body></html>'

        http_headers, gen, is_rw = self.rewrite_record(headers, content,
                                                       ts='201701js_')

        assert ('Content-Type', 'text/html') in http_headers.headers

        exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://example.com/"></a></body></html>'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_js_mod(self, headers):
        """JS content requested with ``js_`` gets text/javascript and JS rewriting."""
        content = 'function() { location.href = "http://example.com/"; }'

        http_headers, gen, is_rw = self.rewrite_record(headers, content,
                                                       ts='201701js_')

        assert ('Content-Type', 'text/javascript') in http_headers.headers

        exp = 'function() { WB_wombat_location.href = "http://example.com/"; }'
        assert b''.join(gen).decode('utf-8') == exp

    def test_rewrite_cs_mod(self, headers):
        """CSS content requested with ``cs_`` gets a text/css Content-Type."""
        # The url in the input already starts with the replay prefix; the
        # expected output is byte-identical to the input.
        content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'

        http_headers, gen, is_rw = self.rewrite_record(headers, content,
                                                       ts='201701cs_')

        assert ('Content-Type', 'text/css') in http_headers.headers

        exp = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
        assert b''.join(gen).decode('utf-8') == exp

    def test_binary_no_content_type(self):
        """Binary content without a Content-Type is left alone."""
        content = '\x11\x12\x13\x14'

        http_headers, gen, is_rw = self.rewrite_record({}, content,
                                                       ts='201701mp_')

        # ``http_headers.headers`` is a list of (name, value) tuples, so a
        # bare ``'Content-Type' not in ...`` string test could never fail;
        # compare against the header names instead.
        names = [name.lower() for name, _ in http_headers.headers]
        assert 'content-type' not in names

        assert not is_rw

View File

@ -44,7 +44,7 @@ class TestHeaderRewriter(object):
HTTP/1.0 200 OK\r\n\
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
Content-Length: 5\r\n\
Content-Type: text/html;charset=UTF-8\r\n\
Content-Type: text/html; charset=utf-8\r\n\
"""
rwinfo = self.do_rewrite('200 OK', headers)
http_headers = PrefixHeaderRewriter(rwinfo)()