mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
calculate payload sha1
This commit is contained in:
parent
9d176a408b
commit
72f141fec3
119
warcprox.py
119
warcprox.py
@ -19,6 +19,7 @@ import os
|
|||||||
import argparse
|
import argparse
|
||||||
import random
|
import random
|
||||||
import httplib
|
import httplib
|
||||||
|
import re
|
||||||
|
|
||||||
class CertificateAuthority(object):
|
class CertificateAuthority(object):
|
||||||
|
|
||||||
@ -110,15 +111,49 @@ class Recorder:
|
|||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.recorded = bytearray('')
|
self.data = bytearray('')
|
||||||
|
self.block_sha1 = hashlib.sha1()
|
||||||
|
self.payload_sha1 = None
|
||||||
|
|
||||||
|
|
||||||
|
def _update(self, chunk):
|
||||||
|
if self.payload_sha1 is None:
|
||||||
|
# convoluted handling of two newlines crossing chunks
|
||||||
|
# XXX write tests for this
|
||||||
|
if self.data.endswith('\n'):
|
||||||
|
if chunk.startswith('\n'):
|
||||||
|
self.payload_sha1 = hashlib.sha1()
|
||||||
|
self.payload_sha1.update(chunk[1:])
|
||||||
|
elif chunk.startswith('\r\n'):
|
||||||
|
self.payload_sha1 = hashlib.sha1()
|
||||||
|
self.payload_sha1.update(chunk[2:])
|
||||||
|
elif self.data.endswith('\n\r'):
|
||||||
|
if chunk.startswith('\n'):
|
||||||
|
self.payload_sha1 = hashlib.sha1()
|
||||||
|
self.payload_sha1.update(chunk[1:])
|
||||||
|
else:
|
||||||
|
m = re.search(r'\n\r?\n', chunk)
|
||||||
|
if m is not None:
|
||||||
|
self.payload_sha1 = hashlib.sha1()
|
||||||
|
self.payload_sha1.update(chunk[m.end():])
|
||||||
|
else:
|
||||||
|
self.payload_sha1.update(chunk)
|
||||||
|
|
||||||
|
self.block_sha1.update(chunk)
|
||||||
|
self.data.extend(chunk)
|
||||||
|
|
||||||
def read(self, size=-1):
|
def read(self, size=-1):
|
||||||
result = self.fp.read(size=size)
|
chunk = self.fp.read(size=size)
|
||||||
self.recorded.extend(result)
|
self._update(chunk)
|
||||||
return result
|
return chunk
|
||||||
|
|
||||||
|
|
||||||
def readline(self, size=-1):
|
def readline(self, size=-1):
|
||||||
return self.fp.readline(size=size)
|
# XXX does not call self.read(); if it ever did this would break
|
||||||
|
chunk = self.fp.readline(size=size)
|
||||||
|
self._update(chunk)
|
||||||
|
return chunk
|
||||||
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
return self.fp.close()
|
return self.fp.close()
|
||||||
@ -217,6 +252,8 @@ class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
return
|
return
|
||||||
# Extract path
|
# Extract path
|
||||||
|
|
||||||
|
warc_record_queuer = WarcRecordQueuer(self.server, self)
|
||||||
|
|
||||||
# Build request
|
# Build request
|
||||||
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
|
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
|
||||||
|
|
||||||
@ -227,10 +264,10 @@ class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
if 'Content-Length' in self.headers:
|
if 'Content-Length' in self.headers:
|
||||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||||
|
|
||||||
interceptors = [p(self.server, self) for p in self.server._interceptors]
|
warc_record_queuer.do_request(req)
|
||||||
|
|
||||||
# Send it down the pipe!
|
# Send it down the pipe!
|
||||||
self._proxy_sock.sendall(self.mitm_request(req, interceptors))
|
self._proxy_sock.sendall(req)
|
||||||
|
|
||||||
# Parse response
|
# Parse response
|
||||||
h = RecordingHTTPResponse(self._proxy_sock)
|
h = RecordingHTTPResponse(self._proxy_sock)
|
||||||
@ -249,25 +286,13 @@ class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
self.request.sendall(buf)
|
self.request.sendall(buf)
|
||||||
buf = h.read(4096)
|
buf = h.read(4096)
|
||||||
|
|
||||||
self.mitm_response(h.recorded(), interceptors)
|
warc_record_queuer.do_response(h.recorder)
|
||||||
|
|
||||||
# Let's close off the remote end
|
# Let's close off the remote end
|
||||||
h.close()
|
h.close()
|
||||||
self._proxy_sock.close()
|
self._proxy_sock.close()
|
||||||
|
|
||||||
|
|
||||||
def mitm_request(self, data, interceptors):
|
|
||||||
for i in interceptors:
|
|
||||||
data = i.do_request(data)
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def mitm_response(self, data, interceptors):
|
|
||||||
for i in interceptors:
|
|
||||||
data = i.do_response(data)
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
def __getattr__(self, item):
|
||||||
if item.startswith('do_'):
|
if item.startswith('do_'):
|
||||||
return self.do_COMMAND
|
return self.do_COMMAND
|
||||||
@ -283,21 +308,6 @@ class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
self.log_date_time_string(), format % args))
|
self.log_date_time_string(), format % args))
|
||||||
|
|
||||||
|
|
||||||
# InterceptorPlugin modified from pymiproxy to send the request and response
|
|
||||||
# from a single transaction through the same instance of the interceptor
|
|
||||||
class InterceptorPlugin(object):
|
|
||||||
|
|
||||||
def __init__(self, server, msg):
|
|
||||||
self.server = server
|
|
||||||
self.message = msg
|
|
||||||
|
|
||||||
def do_request(self, data):
|
|
||||||
return data
|
|
||||||
|
|
||||||
def do_response(self, data):
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidInterceptorPluginException(Exception):
|
class InvalidInterceptorPluginException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -306,24 +316,15 @@ class MitmProxy(BaseHTTPServer.HTTPServer):
|
|||||||
|
|
||||||
def __init__(self, server_address, req_handler_class=ProxyHandler, bind_and_activate=True, ca_file='./warcprox-ca.pem', certs_dir='./warcprox-ca'):
|
def __init__(self, server_address, req_handler_class=ProxyHandler, bind_and_activate=True, ca_file='./warcprox-ca.pem', certs_dir='./warcprox-ca'):
|
||||||
BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
|
BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
|
||||||
self._interceptors = []
|
|
||||||
self.ca = CertificateAuthority(ca_file, certs_dir)
|
self.ca = CertificateAuthority(ca_file, certs_dir)
|
||||||
|
|
||||||
|
|
||||||
def register_interceptor(self, interceptor_class):
|
|
||||||
if not issubclass(interceptor_class, InterceptorPlugin):
|
|
||||||
raise InvalidInterceptorPluginException('Expected type InterceptorPlugin got %s instead' % type(interceptor_class))
|
|
||||||
self._interceptors.append(interceptor_class)
|
|
||||||
|
|
||||||
|
|
||||||
def server_activate(self):
|
def server_activate(self):
|
||||||
BaseHTTPServer.HTTPServer.server_activate(self)
|
BaseHTTPServer.HTTPServer.server_activate(self)
|
||||||
logging.info('listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
|
logging.info('listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
|
||||||
|
|
||||||
|
|
||||||
def server_close(self):
|
def server_close(self):
|
||||||
|
logging.info('shutting down')
|
||||||
BaseHTTPServer.HTTPServer.server_close(self)
|
BaseHTTPServer.HTTPServer.server_close(self)
|
||||||
logging.info('shut down')
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncMitmProxy(SocketServer.ThreadingMixIn, MitmProxy):
|
class AsyncMitmProxy(SocketServer.ThreadingMixIn, MitmProxy):
|
||||||
@ -331,7 +332,7 @@ class AsyncMitmProxy(SocketServer.ThreadingMixIn, MitmProxy):
|
|||||||
|
|
||||||
|
|
||||||
# assumes do_request happens before do_response
|
# assumes do_request happens before do_response
|
||||||
class WarcRecordQueuer(InterceptorPlugin):
|
class WarcRecordQueuer:
|
||||||
|
|
||||||
# Each item in the queue is a tuple of warc records which should be written
|
# Each item in the queue is a tuple of warc records which should be written
|
||||||
# together, e.g. (response, request) where request has WARC-Concurrent-To
|
# together, e.g. (response, request) where request has WARC-Concurrent-To
|
||||||
@ -344,7 +345,8 @@ class WarcRecordQueuer(InterceptorPlugin):
|
|||||||
|
|
||||||
|
|
||||||
def __init__(self, server, msg):
|
def __init__(self, server, msg):
|
||||||
InterceptorPlugin.__init__(self, server, msg)
|
self.server = server
|
||||||
|
self.msg = msg
|
||||||
|
|
||||||
if msg.is_connect:
|
if msg.is_connect:
|
||||||
# have to construct the url if proxy request is a CONNECT
|
# have to construct the url if proxy request is a CONNECT
|
||||||
@ -379,7 +381,7 @@ class WarcRecordQueuer(InterceptorPlugin):
|
|||||||
|
|
||||||
|
|
||||||
def do_request(self, data):
|
def do_request(self, data):
|
||||||
logging.info('{0} >> {1}'.format(self.url, repr(data[:100])))
|
logging.info('{0} >> {1}'.format(self.url, repr(data[:40])))
|
||||||
|
|
||||||
record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(self.url, self._warc_date()))
|
record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(self.url, self._warc_date()))
|
||||||
|
|
||||||
@ -393,11 +395,9 @@ class WarcRecordQueuer(InterceptorPlugin):
|
|||||||
content_tuple = "application/http;msgtype=request", data
|
content_tuple = "application/http;msgtype=request", data
|
||||||
self._request_record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
self._request_record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
def do_response(self, recorder):
|
||||||
def do_response(self, data):
|
logging.info('{0} << {1}'.format(self.url, repr(recorder.data[:40])))
|
||||||
logging.info('{0} << {1}'.format(self.url, repr(data[:100])))
|
|
||||||
|
|
||||||
record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(self.url, self._warc_date()))
|
record_id = WarcRecordQueuer.make_warc_uuid("{0} {1}".format(self.url, self._warc_date()))
|
||||||
|
|
||||||
@ -406,9 +406,12 @@ class WarcRecordQueuer(InterceptorPlugin):
|
|||||||
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.RESPONSE))
|
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.RESPONSE))
|
||||||
headers.append((warctools.WarcRecord.URL, self.url))
|
headers.append((warctools.WarcRecord.URL, self.url))
|
||||||
headers.append((warctools.WarcRecord.DATE, self._warc_date()))
|
headers.append((warctools.WarcRecord.DATE, self._warc_date()))
|
||||||
|
headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(recorder.block_sha1.hexdigest())))
|
||||||
|
# The payload digest must come from payload_sha1 (bytes after the blank
# line); block_sha1 covers the whole block and is already emitted above
# as the block digest.
if recorder.payload_sha1 is not None:
    headers.append(('WARC-Payload-Digest', 'sha1:{}'.format(recorder.payload_sha1.hexdigest())))
|
||||||
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
||||||
|
|
||||||
content_tuple = ("application/http;msgtype=response", data)
|
content_tuple = ("application/http;msgtype=response", recorder.data)
|
||||||
|
|
||||||
response_record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
response_record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||||
|
|
||||||
@ -420,8 +423,6 @@ class WarcRecordQueuer(InterceptorPlugin):
|
|||||||
|
|
||||||
WarcRecordQueuer.warc_record_group_queue.put(record_group)
|
WarcRecordQueuer.warc_record_group_queue.put(record_group)
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
class WarcWriterThread(threading.Thread):
|
class WarcWriterThread(threading.Thread):
|
||||||
|
|
||||||
@ -564,10 +565,12 @@ if __name__ == '__main__':
|
|||||||
# [--httpheader=warcinfo httpheader]
|
# [--httpheader=warcinfo httpheader]
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
|
|
||||||
proxy = AsyncMitmProxy(server_address=(args.address, int(args.port)), ca_file=args.cacert, certs_dir=args.certs_dir)
|
proxy = AsyncMitmProxy(server_address=(args.address, int(args.port)),
|
||||||
proxy.register_interceptor(WarcRecordQueuer)
|
ca_file=args.cacert, certs_dir=args.certs_dir)
|
||||||
|
|
||||||
warc_writer = WarcWriterThread(WarcRecordQueuer.warc_record_group_queue, directory=args.directory, gzip=args.gzip, prefix=args.prefix, size=int(args.size), port=int(args.port))
|
warc_writer = WarcWriterThread(WarcRecordQueuer.warc_record_group_queue,
|
||||||
|
directory=args.directory, gzip=args.gzip, prefix=args.prefix,
|
||||||
|
size=int(args.size), port=int(args.port))
|
||||||
warc_writer.start()
|
warc_writer.start()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user