From c8c0cecda3e9e73bdd3b50a3eb120fd8321c64fb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 27 Jul 2016 21:34:58 -0400 Subject: [PATCH] rewrite improvements: if content-type is text/plain but mod is js_ or cs_, treat as js or css (#31) header rewriter: ensure removed content-length and content-encoding are added back if no rewriting performed on response body --- pywb/rewrite/header_rewriter.py | 19 ++++++++++++------- pywb/rewrite/rewrite_amf.py | 7 +++---- pywb/rewrite/rewrite_content.py | 16 +++++++++++----- pywb/rewrite/test/test_header_rewriter.py | 4 +++- tests/test_live_proxy.py | 4 ++-- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 610df546..ba2a6d03 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -17,6 +17,12 @@ class RewrittenStatusAndHeaders(object): def contains_removed_header(self, name, value): return self.removed_header_dict.get(name) == value + def readd_rewrite_removed(self): + for name in HeaderRewriter.PROXY_NO_REWRITE_HEADERS: + value = self.removed_header_dict.get(name) + if value is not None: + self.status_headers.headers.append((name, value)) + #================================================================= class HeaderRewriter(object): @@ -34,6 +40,8 @@ class HeaderRewriter(object): 'json': ['application/json'], 'xml': ['/xml', '+xml', '.xml', '.rss'], + + 'plain': ['text/plain'], } PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', @@ -41,12 +49,12 @@ class HeaderRewriter(object): URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base'] - ENCODING_HEADERS = ['content-encoding'] + #ENCODING_HEADERS = ['content-encoding'] REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy', 'strict-transport-security'] - PROXY_NO_REWRITE_HEADERS = ['content-length'] + PROXY_NO_REWRITE_HEADERS = ['content-length', 'content-encoding'] COOKIE_HEADERS = ['set-cookie', 'cookie'] @@ -141,9 +149,10 @@ class HeaderRewriter(object): elif urlrewriter and lowername in self.URL_REWRITE_HEADERS: new_headers.append((name, urlrewriter.rewrite(value))) - elif lowername in self.ENCODING_HEADERS: + elif lowername in self.PROXY_NO_REWRITE_HEADERS: if content_rewritten: removed_header_dict[lowername] = value + add_prefixed_header(name, value) else: add_header(name, value) @@ -151,10 +160,6 @@ class HeaderRewriter(object): removed_header_dict[lowername] = value add_prefixed_header(name, value) - elif (lowername in self.PROXY_NO_REWRITE_HEADERS and - not content_rewritten): - add_header(name, value) - elif (lowername in self.COOKIE_HEADERS and cookie_rewriter): cookie_list = cookie_rewriter.rewrite(value) diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py index 07a73470..0cfd217f 100644 --- a/pywb/rewrite/rewrite_amf.py +++ b/pywb/rewrite/rewrite_amf.py @@ -6,13 +6,12 @@ from pywb.rewrite.rewrite_content import RewriteContent # ============================================================================ # Expiermental: not fully tested class RewriteContentAMF(RewriteContent): #pragma: no cover - def handle_custom_rewrite(self, text_type, status_headers, stream, env): - - if status_headers.get_header('Content-Type') == 'application/x-amf': + def handle_custom_rewrite(self, rewritten_headers, stream, mod, env): + if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf': stream = self.rewrite_amf(stream, env) return (super(RewriteContentAMF, self). - handle_custom_rewrite(text_type, status_headers, stream, env)) + handle_custom_rewrite(rewritten_headers, stream, mod, env)) def rewrite_amf(self, stream, env): try: diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 677e20ae..6ae183a5 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -118,11 +118,9 @@ class RewriteContent(object): urlkey, cookie_rewriter) - status_headers = rewritten_headers.status_headers - - res = self.handle_custom_rewrite(rewritten_headers.text_type, - status_headers, + res = self.handle_custom_rewrite(rewritten_headers, stream, + wb_url.mod, env) if res: return res @@ -131,6 +129,7 @@ class RewriteContent(object): # ==================================================================== # special case -- need to ungzip the body + status_headers = rewritten_headers.status_headers text_type = rewritten_headers.text_type # see known js/css modifier specified, the context should run @@ -246,11 +245,18 @@ class RewriteContent(object): return (status_headers, gen, True) - def handle_custom_rewrite(self, text_type, status_headers, stream, env): + def handle_custom_rewrite(self, rewritten_headers, stream, mod, env): + text_type = rewritten_headers.text_type + status_headers = rewritten_headers.status_headers + # use rewritten headers, but no further rewriting needed if text_type is None: return (status_headers, self.stream_to_gen(stream), False) + if text_type == 'plain' and not mod in ('js_', 'cs_'): + rewritten_headers.readd_rewrite_removed() + return (status_headers, self.stream_to_gen(stream), False) + @staticmethod def _extract_html_charset(buff, status_headers): charset = None diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 8e1f1a87..fc2146d7 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -6,7 +6,7 @@ HTTP Headers Rewriting # Text with charset >>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) {'charset': 'utf-8', - 'removed_header_dict': {}, + 'removed_header_dict': {'content-length': '5'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('X-Archive-Orig-Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]), @@ -24,9 +24,11 @@ HTTP Headers Rewriting >>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) {'charset': None, 'removed_header_dict': {'content-encoding': 'gzip', + 'content-length': '199999', 'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), ('Content-Type', 'text/javascript'), + ('X-Archive-Orig-Content-Encoding', 'gzip'), ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': 'js'} diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index 6c48c5de..575c51a8 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -125,7 +125,7 @@ class TestProxyLiveRewriter: def test_echo_proxy_start_unbounded_remove_range(self): headers = [('Range', 'bytes=0-')] - resp = self.testapp.get('/rewrite/http://example.com/', headers=headers) + resp = self.testapp.get('/rewrite/http://httpbin.org/range/100', headers=headers) # actual response is with range assert resp.status_int == 206 @@ -138,7 +138,7 @@ class TestProxyLiveRewriter: assert self.requestlog[0] == resp.text assert resp.headers['x-archive-orig-x-proxy'] == 'test' - assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1') + assert self.requestlog[0].startswith('GET http://httpbin.org/range/100 HTTP/1.1') assert 'range: ' not in self.requestlog[0] assert len(self.cache) == 0