diff --git a/README.md b/README.md index 39a3e04..e298b4f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ incorporated into warctools mainline. usage: warcprox.py [-h] [-p PORT] [-b ADDRESS] [-c CACERT] [--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX] - [-s SIZE] [-v] [-q] + [-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME] + [--base32] [-v] [-q] warcprox - WARC writing MITM HTTP/S proxy @@ -36,10 +37,11 @@ incorporated into warctools mainline. address to listen on (default: localhost) -c CACERT, --cacert CACERT CA certificate file; if file does not exist, it will - be created (default: ./warcprox-ca.pem) + be created (default: ./desktop-nlevitt-warcprox- + ca.pem) --certs-dir CERTS_DIR where to store and load generated certificates - (default: ./warcprox-ca) + (default: ./desktop-nlevitt-warcprox-ca) -d DIRECTORY, --dir DIRECTORY where to write warcs (default: ./warcs) -z, --gzip write gzip-compressed warc records (default: False) @@ -47,6 +49,12 @@ incorporated into warctools mainline. WARC filename prefix (default: WARCPROX) -s SIZE, --size SIZE WARC file rollover size threshold in bytes (default: 1000000000) + --rollover-idle-time ROLLOVER_IDLE_TIME + WARC file rollover idle time threshold in seconds (so + that Friday's last open WARC doesn't sit there all + weekend waiting for more data) (default: None) + --base32 write SHA1 digests in Base32 instead of hex (default: + False) -v, --verbose -q, --quiet @@ -59,7 +67,7 @@ incorporated into warctools mainline. - keep statistics, produce reports - write cdx while crawling? - performance testing -- base32 sha1 like heritrix? +- ~~base32 sha1 like heritrix?~~ - configurable timeouts and stuff - evaluate ipv6 support - more explicit handling of connection closed exception during transfer? other error cases? diff --git a/warcprox.py b/warcprox.py index 925c713..66dcf7e 100755 --- a/warcprox.py +++ b/warcprox.py @@ -23,6 +23,7 @@ import re import signal import time import tempfile +import base64 class CertificateAuthority(object): @@ -354,6 +355,11 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer): # consecutively in the same warc. class WarcRecordsetQueue(Queue.Queue): + def __init__(self, base32=False): + Queue.Queue.__init__(self) + self.base32 = base32 + + def create_and_queue(self, url, request_data, response_recorder, remote_ip): warc_date = warctools.warc.warc_datetime_str(datetime.now()) @@ -373,8 +379,14 @@ class WarcRecordsetQueue(Queue.Queue): self.put(record_group) - @staticmethod - def make_record(url, warc_date=None, recorder=None, data=None, + def digest_str(self, hash_obj): + if self.base32: + return base64.b32encode(hash_obj.digest()) + else: + return hash_obj.hexdigest() + + + def make_record(self, url, warc_date=None, recorder=None, data=None, concurrent_to=None, warc_type=None, content_type=None, remote_ip=None): if warc_date is None: @@ -397,16 +409,19 @@ class WarcRecordsetQueue(Queue.Queue): if recorder is not None: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)))) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(recorder.block_sha1.hexdigest()))) + headers.append((warctools.WarcRecord.BLOCK_DIGEST, + 'sha1:{}'.format(self.digest_str(recorder.block_sha1)))) if recorder.payload_sha1 is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, 'sha1:{}'.format(recorder.payload_sha1.hexdigest()))) + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, + 'sha1:{}'.format(self.digest_str(recorder.payload_sha1)))) recorder.tempfile.seek(0) record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) else: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)))) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(hashlib.sha1(data).hexdigest()))) + headers.append((warctools.WarcRecord.BLOCK_DIGEST, + 'sha1:{}'.format(self.digest_str(hashlib.sha1(data))))) content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple) @@ -573,6 +588,8 @@ if __name__ == '__main__': arg_parser.add_argument('--rollover-idle-time', dest='rollover_idle_time', default=None, help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)") + arg_parser.add_argument('--base32', dest='base32', action='store_true', + default=False, help='write SHA1 digests in Base32 instead of hex') arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') # [--ispartof=warcinfo ispartof] @@ -591,7 +608,7 @@ if __name__ == '__main__': logging.basicConfig(stream=sys.stdout, level=loglevel, format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s') - recordset_q = WarcRecordsetQueue() + recordset_q = WarcRecordsetQueue(base32=args.base32) proxy = WarcProxy(server_address=(args.address, int(args.port)), ca_file=args.cacert, certs_dir=args.certs_dir,