From eeb35ea3b42eb2f0f0483178b8a7c16f8cb97674 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 25 Oct 2015 18:02:51 -0700 Subject: [PATCH] proxy: add ProxyRouter wrapper to check for content-length and, if missing, perform full buffering (http1.0) or chunked encoding (http1.1) (separate from replay view buffering) add tests for buffering and chunked encoding, fixes #143, also tests no banner url-rewrite only proxy related to #142 --- pywb/framework/proxy.py | 58 +++++++++++++++++++++++++- pywb/rewrite/test/test_rewrite_live.py | 2 +- pywb/webapp/replay_views.py | 7 +--- tests/test_config_proxy_no_banner.yaml | 18 ++++++++ tests/test_proxy_http_no_banner.py | 48 +++++++++++++++++++++ 5 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 tests/test_config_proxy_no_banner.yaml create mode 100644 tests/test_proxy_http_no_banner.py diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 61ad4d16..6024c150 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -10,12 +10,15 @@ import socket import ssl from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter +from pywb.rewrite.rewrite_content import RewriteContent from pywb.utils.wbexception import BadRequestException from pywb.utils.bufferedreaders import BufferedReader from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver +from tempfile import SpooledTemporaryFile + #================================================================= class ProxyArchivalRouter(ArchivalRouter): @@ -55,6 +58,7 @@ class ProxyRouter(object): BLOCK_SIZE = 4096 DEF_MAGIC_NAME = 'pywb.proxy' + BUFF_RESPONSE_MEM_SIZE = 1024*1024 CERT_DL_PEM = '/pywb-ca.pem' CERT_DL_P12 = '/pywb-ca.p12' @@ -222,12 +226,63 @@ class ProxyRouter(object): wbrequest.wb_url.mod = 'uo_' response = route.handler(wbrequest) + if not response: + return None + # add extra headers for replay responses if wbrequest.wb_url and wbrequest.wb_url.is_replay(): response.status_headers.replace_headers(self.extra_headers) + # check for content-length + res = response.status_headers.get_header('content-length') + try: + if int(res) > 0: + return response + except: + pass + + # need to either chunk or buffer to get content-length + if env.get('SERVER_PROTOCOL') == 'HTTP/1.1': + response.status_headers.remove_header('content-length') + response.status_headers.headers.append(('Transfer-Encoding', 'chunked')) + response.body = self._chunk_encode(response.body) + else: + response.body = self._buffer_response(response.status_headers, + response.body) + return response + @staticmethod + def _chunk_encode(orig_iter): + for buff in orig_iter: + chunk = bytes(buff) + if not len(chunk): + continue + chunk_len = '%X\r\n' % len(chunk) + yield chunk_len + yield chunk + yield '\r\n' + + yield '0\r\n\r\n' + + @staticmethod + def _buffer_response(status_headers, iterator): + out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE) + size = 0 + + for buff in iterator: + buff = bytes(buff) + size += len(buff) + out.write(buff) + + content_length_str = str(size) + # remove existing content length + status_headers.replace_header('Content-Length', + content_length_str) + + out.seek(0) + return RewriteContent.stream_to_gen(out) + def get_request_socket(self, env): if not self.ca: return None @@ -259,7 +314,8 @@ class ProxyRouter(object): return WbResponse.text_response('HTTPS Proxy Not Supported', '405 HTTPS Proxy Not Supported') - sock.send('HTTP/1.0 200 Connection Established\r\n') + sock.send('HTTP/1.1 200 Connection Established\r\n') + sock.send('Proxy-Connection: close\r\n') sock.send('Server: pywb proxy\r\n') sock.send('\r\n') diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 02c17fe0..21f1cbed 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -203,7 +203,7 @@ def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'}) # verify header rewriting - assert (('X-Archive-Orig-Content-Length', '1270') in status_headers.headers), status_headers + assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers # verify utf-8 charset detection diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 50441000..d6637141 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -187,12 +187,7 @@ class ReplayView(object): content_len = 0 if content_len <= 0: - # if proxy mode, must set content-length (or use chunked) - if wbrequest.options.get('is_proxy'): - max_size = 0 - else: - max_size = self.buffer_max_size - + max_size = self.buffer_max_size response_iter = self.buffered_response(status_headers, response_iter, max_size) diff --git a/tests/test_config_proxy_no_banner.yaml b/tests/test_config_proxy_no_banner.yaml new file mode 100644 index 00000000..596b764e --- /dev/null +++ b/tests/test_config_proxy_no_banner.yaml @@ -0,0 +1,18 @@ +collections: + all: + - ./sample_archive/cdx/iana.cdx + +archive_paths: ./sample_archive/warcs/ + +enable_http_proxy: true + +buffer_response: false + +proxy_options: + enable_https_proxy: false + + cookie_resolver: ip + use_default_coll: all + + use_banner: false + use_client_rewrite: false diff --git a/tests/test_proxy_http_no_banner.py b/tests/test_proxy_http_no_banner.py new file mode 100644 index 00000000..3168b67e --- /dev/null +++ b/tests/test_proxy_http_no_banner.py @@ -0,0 +1,48 @@ +from pytest import raises +import webtest +import base64 + +from pywb.webapp.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app +from pywb.cdx.cdxobject import CDXObject + +from urlparse import urlsplit + +from server_mock import make_setup_module, BaseIntegration + +setup_module = make_setup_module('tests/test_config_proxy_no_banner.yaml') + +class TestProxyNoBanner(BaseIntegration): + def get_url(self, uri, addr='127.0.0.1', server_protocol='HTTP/1.0'): + parts = urlsplit(uri) + env = dict(REQUEST_URI=uri, QUERY_STRING=parts.query, SCRIPT_NAME='', + SERVER_PROTOCOL=server_protocol, REMOTE_ADDR=addr) + # 'Simulating' proxy by settings REQUEST_URI explicitly to full url with empty SCRIPT_NAME + return self.testapp.get('/x-ignore-this-x', extra_environ=env) + + def test_proxy_chunked(self): + resp = self.get_url('http://www.iana.org/_img/2013.1/icann-logo.svg', server_protocol='HTTP/1.1') + assert resp.content_type == 'image/svg+xml' + assert resp.headers['Transfer-Encoding'] == 'chunked' + assert int(resp.headers['Content-Length']) == len(resp.body) + + def test_proxy_buffered(self): + resp = self.get_url('http://www.iana.org/_img/2013.1/icann-logo.svg', server_protocol='HTTP/1.0') + assert resp.content_type == 'image/svg+xml' + assert 'Transfer-Encoding' not in resp.headers + assert int(resp.headers['Content-Length']) == len(resp.body) + + def test_proxy_html_url_only_rewrite_buffered(self): + resp = self.get_url('http://www.iana.org/', server_protocol='HTTP/1.0') + assert 'Transfer-Encoding' not in resp.headers + assert int(resp.headers['Content-Length']) == len(resp.body) + + def test_proxy_js_url_only_rewrite_buffered(self): + resp = self.get_url('http://www.iana.org/_js/2013.1/iana.js', server_protocol='HTTP/1.0') + assert 'Transfer-Encoding' not in resp.headers + assert int(resp.headers['Content-Length']) == len(resp.body) + + def test_proxy_js_url_only_rewrite_chunked(self): + resp = self.get_url('http://www.iana.org/_js/2013.1/iana.js', server_protocol='HTTP/1.1') + assert resp.headers['Transfer-Encoding'] == 'chunked' + assert int(resp.headers['Content-Length']) == len(resp.body)