Since aborting the connection is normal browser behavior in many circumstances, handle it gracefully: continue downloading and archiving the URL from the remote server.

This commit is contained in:
Noah Levitt 2013-10-30 17:57:59 -07:00
parent 534c61a4c1
commit 630779ff0b
2 changed files with 22 additions and 6 deletions

View File

@ -83,8 +83,8 @@ incorporated into warctools mainline.
- dns cache?? the system already does a fine job I'm thinking - dns cache?? the system already does a fine job I'm thinking
- keepalive with remote servers? - keepalive with remote servers?
- python3 - python3
- special handling for 304 not-modified (either write revisit record, or modify - special handling for 304 not-modified (write nothing or write revisit
request so server never responds with 304) record... and/or modify request so server never responds with 304)
#### To not do #### To not do

View File

@ -1,6 +1,11 @@
#!/usr/bin/python #!/usr/bin/python
# vim:set sw=4 et: # vim:set sw=4 et:
# #
"""
WARC writing MITM HTTP/S proxy
See README.md or https://github.com/internetarchive/warcprox
"""
import BaseHTTPServer, SocketServer import BaseHTTPServer, SocketServer
import socket import socket
@ -112,9 +117,11 @@ class UnsupportedSchemeException(Exception):
pass pass
# This class intercepts the raw bytes, so it's the easiest place to hook in to
# send the raw bytes on to the proxy destination.
class ProxyingRecorder: class ProxyingRecorder:
"""
Wraps a socket._fileobject, recording the bytes as they are read,
calculating digests, and sending them on to the proxy client.
"""
def __init__(self, fp, proxy_dest, digest_algorithm='sha1'): def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
self.fp = fp self.fp = fp
@ -125,6 +132,7 @@ class ProxyingRecorder:
self.payload_offset = None self.payload_offset = None
self.payload_digest = None self.payload_digest = None
self.proxy_dest = proxy_dest self.proxy_dest = proxy_dest
self._proxy_dest_conn_open = True
self._prev_hunk_last_two_bytes = '' self._prev_hunk_last_two_bytes = ''
self.len = 0 self.len = 0
@ -162,7 +170,15 @@ class ProxyingRecorder:
self.block_digest.update(hunk) self.block_digest.update(hunk)
self.tempfile.write(hunk) self.tempfile.write(hunk)
self.proxy_dest.sendall(hunk)
if self._proxy_dest_conn_open:
try:
self.proxy_dest.sendall(hunk)
except BaseException as e:
self._proxy_dest_conn_open = False
logging.warn('{} sending data to proxy client'.format(e))
logging.info('will continue downloading from remote server without sending to client')
self.len += len(hunk) self.len += len(hunk)
@ -404,7 +420,7 @@ class DedupDb:
json_value = json.dumps(py_value, separators=(',',':')) json_value = json.dumps(py_value, separators=(',',':'))
self.db[key] = json_value self.db[key] = json_value
logging.info('dedup db saved {}={}'.format(key, json_value)) logging.info('dedup db saved {}:{}'.format(key, json_value))
def lookup(self, key): def lookup(self, key):