mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
send raw bytes from server response back to proxy client (not unchunked)
This commit is contained in:
parent
5f90e76ca6
commit
fc139f1f4e
85
warcprox.py
85
warcprox.py
@ -109,66 +109,68 @@ class UnsupportedSchemeException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Recorder:
|
# This class intercepts the raw bytes, so it's the easiest place to hook in to
|
||||||
|
# send the raw bytes on to the proxy destination.
|
||||||
|
class ProxyingRecorder:
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp, proxy_dest):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.data = bytearray('')
|
self.data = bytearray('')
|
||||||
self.block_sha1 = hashlib.sha1()
|
self.block_sha1 = hashlib.sha1()
|
||||||
self.payload_sha1 = None
|
self.payload_sha1 = None
|
||||||
|
self.proxy_dest = proxy_dest
|
||||||
|
|
||||||
|
def _update(self, hunk):
|
||||||
def _update(self, chunk):
|
|
||||||
if self.payload_sha1 is None:
|
if self.payload_sha1 is None:
|
||||||
# convoluted handling of two newlines crossing chunks
|
# convoluted handling of two newlines crossing chunks
|
||||||
# XXX write tests for this
|
# XXX write tests for this
|
||||||
if self.data.endswith('\n'):
|
if self.data.endswith('\n'):
|
||||||
if chunk.startswith('\n'):
|
if hunk.startswith('\n'):
|
||||||
self.payload_sha1 = hashlib.sha1()
|
self.payload_sha1 = hashlib.sha1()
|
||||||
self.payload_sha1.update(chunk[1:])
|
self.payload_sha1.update(hunk[1:])
|
||||||
elif chunk.startswith('\r\n'):
|
elif hunk.startswith('\r\n'):
|
||||||
self.payload_sha1 = hashlib.sha1()
|
self.payload_sha1 = hashlib.sha1()
|
||||||
self.payload_sha1.update(chunk[2:])
|
self.payload_sha1.update(hunk[2:])
|
||||||
elif self.data.endswith('\n\r'):
|
elif self.data.endswith('\n\r'):
|
||||||
if chunk.startswith('\n'):
|
if hunk.startswith('\n'):
|
||||||
self.payload_sha1 = hashlib.sha1()
|
self.payload_sha1 = hashlib.sha1()
|
||||||
self.payload_sha1.update(chunk[1:])
|
self.payload_sha1.update(hunk[1:])
|
||||||
else:
|
else:
|
||||||
m = re.search(r'\n\r?\n', chunk)
|
m = re.search(r'\n\r?\n', hunk)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
self.payload_sha1 = hashlib.sha1()
|
self.payload_sha1 = hashlib.sha1()
|
||||||
self.payload_sha1.update(chunk[m.end():])
|
self.payload_sha1.update(hunk[m.end():])
|
||||||
else:
|
else:
|
||||||
self.payload_sha1.update(chunk)
|
self.payload_sha1.update(hunk)
|
||||||
|
|
||||||
self.block_sha1.update(chunk)
|
self.block_sha1.update(hunk)
|
||||||
self.data.extend(chunk)
|
|
||||||
|
self.data.extend(hunk)
|
||||||
|
self.proxy_dest.sendall(hunk)
|
||||||
|
|
||||||
def read(self, size=-1):
|
def read(self, size=-1):
|
||||||
chunk = self.fp.read(size=size)
|
hunk = self.fp.read(size=size)
|
||||||
self._update(chunk)
|
self._update(hunk)
|
||||||
return chunk
|
return hunk
|
||||||
|
|
||||||
|
|
||||||
def readline(self, size=-1):
|
def readline(self, size=-1):
|
||||||
# XXX does not call self.read(); if it ever did this would break
|
# XXX does not call self.read(); if it ever did this would break
|
||||||
chunk = self.fp.readline(size=size)
|
hunk = self.fp.readline(size=size)
|
||||||
self._update(chunk)
|
self._update(hunk)
|
||||||
return chunk
|
return hunk
|
||||||
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
return self.fp.close()
|
return self.fp.close()
|
||||||
|
|
||||||
|
|
||||||
class RecordingHTTPResponse(httplib.HTTPResponse):
|
class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
|
||||||
|
|
||||||
def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False):
|
def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False, proxy_dest=None):
|
||||||
httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, strict=strict, method=method, buffering=buffering)
|
httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, strict=strict, method=method, buffering=buffering)
|
||||||
|
|
||||||
# Keep around extra reference to self.fp because HTTPResponse sets
|
# Keep around extra reference to self.fp because HTTPResponse sets
|
||||||
# self.fp=None after it finishes reading, but we still need it
|
# self.fp=None after it finishes reading, but we still need it
|
||||||
self.recorder = Recorder(self.fp)
|
self.recorder = ProxyingRecorder(self.fp, proxy_dest)
|
||||||
self.fp = self.recorder
|
self.fp = self.recorder
|
||||||
|
|
||||||
def recorded(self):
|
def recorded(self):
|
||||||
@ -264,35 +266,32 @@ class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
# Append message body if present to the request
|
# Append message body if present to the request
|
||||||
if 'Content-Length' in self.headers:
|
if 'Content-Length' in self.headers:
|
||||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||||
|
|
||||||
warc_record_queuer.do_request(req)
|
|
||||||
|
|
||||||
# Send it down the pipe!
|
# Send it down the pipe!
|
||||||
self._proxy_sock.sendall(req)
|
self._proxy_sock.sendall(req)
|
||||||
|
|
||||||
# Parse response
|
# We want HTTPResponse's smarts about http and handling of
|
||||||
h = RecordingHTTPResponse(self._proxy_sock)
|
# non-compliant servers. But HTTPResponse.read() doesn't return the raw
|
||||||
|
# bytes read from the server, it unchunks them if they're chunked, and
|
||||||
|
# might do other stuff. We want to send the raw bytes back to the
|
||||||
|
# client. So we ignore the values returned by h.read() below. Instead
|
||||||
|
# the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
|
||||||
|
# to the proxy client.
|
||||||
|
|
||||||
|
# Proxy and record the response
|
||||||
|
h = ProxyingRecordingHTTPResponse(self._proxy_sock, proxy_dest=self.request)
|
||||||
h.begin()
|
h.begin()
|
||||||
|
|
||||||
# Get rid of the pesky header
|
|
||||||
del h.msg['Transfer-Encoding']
|
|
||||||
|
|
||||||
# Time to relay the message across
|
|
||||||
headers = '%s %s %s\r\n' % (self.request_version, h.status, h.reason)
|
|
||||||
headers += '%s\r\n' % h.msg
|
|
||||||
self.request.sendall(headers)
|
|
||||||
|
|
||||||
buf = h.read(4096)
|
buf = h.read(4096)
|
||||||
while buf != '':
|
while buf != '':
|
||||||
self.request.sendall(buf)
|
|
||||||
buf = h.read(4096)
|
buf = h.read(4096)
|
||||||
|
|
||||||
warc_record_queuer.do_response(h.recorder)
|
|
||||||
|
|
||||||
# Let's close off the remote end
|
# Let's close off the remote end
|
||||||
h.close()
|
h.close()
|
||||||
self._proxy_sock.close()
|
self._proxy_sock.close()
|
||||||
|
|
||||||
|
warc_record_queuer.do_request(req)
|
||||||
|
warc_record_queuer.do_response(h.recorder)
|
||||||
|
|
||||||
def __getattr__(self, item):
|
def __getattr__(self, item):
|
||||||
if item.startswith('do_'):
|
if item.startswith('do_'):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user