From 630779ff0b76b4ffef0b3cbc6884ff056efb47b0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 Oct 2013 17:57:59 -0700 Subject: [PATCH] since aborting the connection is normal behavior in many circumstances for browsers, handle it gracefully, continuing to download and archive the url from the remote server --- README.md | 4 ++-- warcprox.py | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3b77234..43fefd7 100644 --- a/README.md +++ b/README.md @@ -83,8 +83,8 @@ incorporated into warctools mainline. - dns cache?? the system already does a fine job I'm thinking - keepalive with remote servers? - python3 -- special handling for 304 not-modified (either write revisit record, or modify - request so server never responds with 304) +- special handling for 304 not-modified (write nothing or write revisit + record... and/or modify request so server never responds with 304) #### To not do diff --git a/warcprox.py b/warcprox.py index 5a878da..d52f610 100755 --- a/warcprox.py +++ b/warcprox.py @@ -1,6 +1,11 @@ #!/usr/bin/python # vim:set sw=4 et: # +""" +WARC writing MITM HTTP/S proxy + +See README.md or https://github.com/internetarchive/warcprox +""" import BaseHTTPServer, SocketServer import socket @@ -112,9 +117,11 @@ class UnsupportedSchemeException(Exception): pass -# This class intercepts the raw bytes, so it's the easiest place to hook in to -# send the raw bytes on to the proxy destination. class ProxyingRecorder: + """ + Wraps a socket._fileobject, recording the bytes as they are read, + calculating digests, and sending them on to the proxy client. + """ def __init__(self, fp, proxy_dest, digest_algorithm='sha1'): self.fp = fp @@ -125,6 +132,7 @@ class ProxyingRecorder: self.payload_offset = None self.payload_digest = None self.proxy_dest = proxy_dest + self._proxy_dest_conn_open = True self._prev_hunk_last_two_bytes = '' self.len = 0 @@ -162,7 +170,15 @@ class ProxyingRecorder: self.block_digest.update(hunk) self.tempfile.write(hunk) - self.proxy_dest.sendall(hunk) + + if self._proxy_dest_conn_open: + try: + self.proxy_dest.sendall(hunk) + except BaseException as e: + self._proxy_dest_conn_open = False + logging.warn('{} sending data to proxy client'.format(e)) + logging.info('will continue downloading from remote server without sending to client') + self.len += len(hunk) @@ -404,7 +420,7 @@ class DedupDb: json_value = json.dumps(py_value, separators=(',',':')) self.db[key] = json_value - logging.info('dedup db saved {}={}'.format(key, json_value)) + logging.info('dedup db saved {}:{}'.format(key, json_value)) def lookup(self, key):