1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

proxy: add ProxyRouter wrapper to check for content-length and, if missing, perform full buffering (http1.0) or chunked encoding (http1.1) (separate from replay view buffering)

add tests for buffering and chunked encoding (fixes #143); also test the no-banner, url-rewrite-only proxy mode (related to #142)
This commit is contained in:
Ilya Kreymer 2015-10-25 18:02:51 -07:00
parent 0c96591c49
commit eeb35ea3b4
5 changed files with 125 additions and 8 deletions

View File

@ -10,12 +10,15 @@ import socket
import ssl
from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
from tempfile import SpooledTemporaryFile
#=================================================================
class ProxyArchivalRouter(ArchivalRouter):
@ -55,6 +58,7 @@ class ProxyRouter(object):
BLOCK_SIZE = 4096
DEF_MAGIC_NAME = 'pywb.proxy'
BUFF_RESPONSE_MEM_SIZE = 1024*1024
CERT_DL_PEM = '/pywb-ca.pem'
CERT_DL_P12 = '/pywb-ca.p12'
@ -222,12 +226,63 @@ class ProxyRouter(object):
wbrequest.wb_url.mod = 'uo_'
response = route.handler(wbrequest)
if not response:
return None
# add extra headers for replay responses
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
response.status_headers.replace_headers(self.extra_headers)
# check for content-length
res = response.status_headers.get_header('content-length')
try:
if int(res) > 0:
return response
except:
pass
# need to either chunk or buffer to get content-length
if env.get('SERVER_PROTOCOL') == 'HTTP/1.1':
response.status_headers.remove_header('content-length')
response.status_headers.headers.append(('Transfer-Encoding', 'chunked'))
response.body = self._chunk_encode(response.body)
else:
response.body = self._buffer_response(response.status_headers,
response.body)
return response
@staticmethod
def _chunk_encode(orig_iter):
    """Wrap a body iterator in HTTP/1.1 chunked transfer encoding.

    For every non-empty buffer produced by ``orig_iter`` three pieces are
    yielded in order: the chunk length in upper-case hex followed by CRLF,
    the payload itself, and a closing CRLF.  Empty buffers are skipped,
    because a zero-length chunk is what marks the end of the body.  Once
    the input is exhausted the terminating ``0`` chunk is yielded.

    All framing is emitted as byte strings so it has the same type as the
    payload; ``b''`` literals are plain ``str`` on Python 2, so the output
    there is unchanged.

    :param orig_iter: iterable of body buffers
    :return: generator of byte strings forming the chunked body
    """
    for buff in orig_iter:
        chunk = bytes(buff)
        if not chunk:
            continue

        # chunk size line: hex digits only, no '0x' prefix
        yield b'%X\r\n' % len(chunk)
        yield chunk
        yield b'\r\n'

    # last-chunk marker plus the blank line that ends the (empty) trailer
    yield b'0\r\n\r\n'
@staticmethod
def _buffer_response(status_headers, iterator):
    """Drain a body iterator into a spool file to learn its total length.

    The whole body is written to a ``SpooledTemporaryFile`` whose in-memory
    limit is ``ProxyRouter.BUFF_RESPONSE_MEM_SIZE``; per the stdlib, data
    beyond that size is rolled over to disk.  The number of bytes written
    is then stored as the ``Content-Length`` header, and the spool is
    rewound and handed to ``RewriteContent.stream_to_gen``.

    :param status_headers: headers object; its ``Content-Length`` is
        replaced with the measured size
    :param iterator: iterable of body buffers
    :return: the result of ``RewriteContent.stream_to_gen`` on the rewound
        spool (presumably a generator re-reading the buffered body, and
        presumably responsible for closing it -- confirm against
        ``stream_to_gen``)
    """
    out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
    size = 0

    try:
        for buff in iterator:
            buff = bytes(buff)
            size += len(buff)
            out.write(buff)
    except BaseException:
        # the spool is only handed off on success; if the source iterator
        # fails part-way, release it here instead of leaking it
        out.close()
        raise

    # replace any existing content length with the measured size
    status_headers.replace_header('Content-Length', str(size))

    out.seek(0)
    return RewriteContent.stream_to_gen(out)
def get_request_socket(self, env):
if not self.ca:
return None
@ -259,7 +314,8 @@ class ProxyRouter(object):
return WbResponse.text_response('HTTPS Proxy Not Supported',
'405 HTTPS Proxy Not Supported')
sock.send('HTTP/1.0 200 Connection Established\r\n')
sock.send('HTTP/1.1 200 Connection Established\r\n')
sock.send('Proxy-Connection: close\r\n')
sock.send('Server: pywb proxy\r\n')
sock.send('\r\n')

View File

@ -203,7 +203,7 @@ def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})
# verify header rewriting
assert (('X-Archive-Orig-Content-Length', '1270') in status_headers.headers), status_headers
assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers
# verify utf-8 charset detection

View File

@ -187,12 +187,7 @@ class ReplayView(object):
content_len = 0
if content_len <= 0:
# if proxy mode, must set content-length (or use chunked)
if wbrequest.options.get('is_proxy'):
max_size = 0
else:
max_size = self.buffer_max_size
max_size = self.buffer_max_size
response_iter = self.buffered_response(status_headers,
response_iter,
max_size)

View File

@ -0,0 +1,18 @@
# Proxy test configuration: HTTP proxy enabled, with the banner and
# client-side rewriting both switched off (see the last two options).
# NOTE(review): presumably this is tests/test_config_proxy_no_banner.yaml,
# loaded by the no-banner proxy tests -- confirm.
collections:
all:
- ./sample_archive/cdx/iana.cdx
archive_paths: ./sample_archive/warcs/
enable_http_proxy: true
# NOTE(review): presumably turns off replay-view buffering so that the
# proxy-level buffering / chunking is what the tests exercise -- confirm.
buffer_response: false
proxy_options:
enable_https_proxy: false
cookie_resolver: ip
use_default_coll: all
use_banner: false
use_client_rewrite: false

View File

@ -0,0 +1,48 @@
from pytest import raises
import webtest
import base64
from pywb.webapp.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from urlparse import urlsplit
from server_mock import make_setup_module, BaseIntegration
# pytest runs a module-level ``setup_module`` once before the tests in this
# module; here it is produced by make_setup_module for the no-banner proxy
# config (presumably it builds the test app from that file -- confirm).
setup_module = make_setup_module('tests/test_config_proxy_no_banner.yaml')
class TestProxyNoBanner(BaseIntegration):
    """Proxy-mode response framing tests for the no-banner configuration.

    Each test issues a simulated proxy request with a chosen
    SERVER_PROTOCOL and asserts on the Transfer-Encoding and
    Content-Length headers of the response.
    """

    SVG_URL = 'http://www.iana.org/_img/2013.1/icann-logo.svg'
    HTML_URL = 'http://www.iana.org/'
    JS_URL = 'http://www.iana.org/_js/2013.1/iana.js'

    def get_url(self, uri, addr='127.0.0.1', server_protocol='HTTP/1.0'):
        """GET ``uri`` through the test app as if it were a proxy request.

        :param uri: full url to request
        :param addr: value placed in REMOTE_ADDR
        :param server_protocol: value placed in SERVER_PROTOCOL
        :return: the response returned by ``self.testapp.get``
        """
        query = urlsplit(uri).query

        # 'Simulating' proxy by settings REQUEST_URI explicitly to full url
        # with empty SCRIPT_NAME
        environ = {
            'REQUEST_URI': uri,
            'QUERY_STRING': query,
            'SCRIPT_NAME': '',
            'SERVER_PROTOCOL': server_protocol,
            'REMOTE_ADDR': addr,
        }
        return self.testapp.get('/x-ignore-this-x', extra_environ=environ)

    def test_proxy_chunked(self):
        """SVG over HTTP/1.1: response is marked chunked."""
        response = self.get_url(self.SVG_URL, server_protocol='HTTP/1.1')
        headers = response.headers

        assert response.content_type == 'image/svg+xml'
        assert headers['Transfer-Encoding'] == 'chunked'

        declared = int(headers['Content-Length'])
        assert declared == len(response.body)

    def test_proxy_buffered(self):
        """SVG over HTTP/1.0: no Transfer-Encoding, length matches body."""
        response = self.get_url(self.SVG_URL, server_protocol='HTTP/1.0')
        headers = response.headers

        assert response.content_type == 'image/svg+xml'
        assert 'Transfer-Encoding' not in headers

        declared = int(headers['Content-Length'])
        assert declared == len(response.body)

    def test_proxy_html_url_only_rewrite_buffered(self):
        """HTML page over HTTP/1.0: no Transfer-Encoding, length matches."""
        response = self.get_url(self.HTML_URL, server_protocol='HTTP/1.0')
        headers = response.headers

        assert 'Transfer-Encoding' not in headers

        declared = int(headers['Content-Length'])
        assert declared == len(response.body)

    def test_proxy_js_url_only_rewrite_buffered(self):
        """JS file over HTTP/1.0: no Transfer-Encoding, length matches."""
        response = self.get_url(self.JS_URL, server_protocol='HTTP/1.0')
        headers = response.headers

        assert 'Transfer-Encoding' not in headers

        declared = int(headers['Content-Length'])
        assert declared == len(response.body)

    def test_proxy_js_url_only_rewrite_chunked(self):
        """JS file over HTTP/1.1: response is marked chunked."""
        response = self.get_url(self.JS_URL, server_protocol='HTTP/1.1')
        headers = response.headers

        assert headers['Transfer-Encoding'] == 'chunked'

        declared = int(headers['Content-Length'])
        assert declared == len(response.body)