mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Since aborting the connection is normal browser behavior in many circumstances, handle it gracefully and continue downloading and archiving the URL from the remote server.
This commit is contained in:
parent
534c61a4c1
commit
630779ff0b
@ -83,8 +83,8 @@ incorporated into warctools mainline.
|
||||
- dns cache?? the system already does a fine job I'm thinking
|
||||
- keepalive with remote servers?
|
||||
- python3
|
||||
- special handling for 304 not-modified (either write revisit record, or modify
|
||||
request so server never responds with 304)
|
||||
- special handling for 304 not-modified (write nothing or write revisit
|
||||
record... and/or modify request so server never responds with 304)
|
||||
|
||||
#### To not do
|
||||
|
||||
|
24
warcprox.py
24
warcprox.py
@ -1,6 +1,11 @@
|
||||
#!/usr/bin/python
|
||||
# vim:set sw=4 et:
|
||||
#
|
||||
"""
|
||||
WARC writing MITM HTTP/S proxy
|
||||
|
||||
See README.md or https://github.com/internetarchive/warcprox
|
||||
"""
|
||||
|
||||
import BaseHTTPServer, SocketServer
|
||||
import socket
|
||||
@ -112,9 +117,11 @@ class UnsupportedSchemeException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
# This class intercepts the raw bytes, so it's the easiest place to hook in to
|
||||
# send the raw bytes on to the proxy destination.
|
||||
class ProxyingRecorder:
|
||||
"""
|
||||
Wraps a socket._fileobject, recording the bytes as they are read,
|
||||
calculating digests, and sending them on to the proxy client.
|
||||
"""
|
||||
|
||||
def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
|
||||
self.fp = fp
|
||||
@ -125,6 +132,7 @@ class ProxyingRecorder:
|
||||
self.payload_offset = None
|
||||
self.payload_digest = None
|
||||
self.proxy_dest = proxy_dest
|
||||
self._proxy_dest_conn_open = True
|
||||
self._prev_hunk_last_two_bytes = ''
|
||||
self.len = 0
|
||||
|
||||
@ -162,7 +170,15 @@ class ProxyingRecorder:
|
||||
self.block_digest.update(hunk)
|
||||
|
||||
self.tempfile.write(hunk)
|
||||
self.proxy_dest.sendall(hunk)
|
||||
|
||||
if self._proxy_dest_conn_open:
|
||||
try:
|
||||
self.proxy_dest.sendall(hunk)
|
||||
except BaseException as e:
|
||||
self._proxy_dest_conn_open = False
|
||||
logging.warn('{} sending data to proxy client'.format(e))
|
||||
logging.info('will continue downloading from remote server without sending to client')
|
||||
|
||||
self.len += len(hunk)
|
||||
|
||||
|
||||
@ -404,7 +420,7 @@ class DedupDb:
|
||||
json_value = json.dumps(py_value, separators=(',',':'))
|
||||
|
||||
self.db[key] = json_value
|
||||
logging.info('dedup db saved {}={}'.format(key, json_value))
|
||||
logging.info('dedup db saved {}:{}'.format(key, json_value))
|
||||
|
||||
|
||||
def lookup(self, key):
|
||||
|
Loading…
x
Reference in New Issue
Block a user