mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
--base32 write SHA1 digests in Base32 instead of hex (default: False)
This commit is contained in:
parent
1ab5c1f683
commit
57c21920bd
16
README.md
16
README.md
@@ -25,7 +25,8 @@ incorporated into warctools mainline.
|
|||||||
|
|
||||||
usage: warcprox.py [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
usage: warcprox.py [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
||||||
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
||||||
[-s SIZE] [-v] [-q]
|
[-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
|
||||||
|
[--base32] [-v] [-q]
|
||||||
|
|
||||||
warcprox - WARC writing MITM HTTP/S proxy
|
warcprox - WARC writing MITM HTTP/S proxy
|
||||||
|
|
||||||
@@ -36,10 +37,11 @@ incorporated into warctools mainline.
|
|||||||
address to listen on (default: localhost)
|
address to listen on (default: localhost)
|
||||||
-c CACERT, --cacert CACERT
|
-c CACERT, --cacert CACERT
|
||||||
CA certificate file; if file does not exist, it will
|
CA certificate file; if file does not exist, it will
|
||||||
be created (default: ./warcprox-ca.pem)
|
be created (default: ./desktop-nlevitt-warcprox-
|
||||||
|
ca.pem)
|
||||||
--certs-dir CERTS_DIR
|
--certs-dir CERTS_DIR
|
||||||
where to store and load generated certificates
|
where to store and load generated certificates
|
||||||
(default: ./warcprox-ca)
|
(default: ./desktop-nlevitt-warcprox-ca)
|
||||||
-d DIRECTORY, --dir DIRECTORY
|
-d DIRECTORY, --dir DIRECTORY
|
||||||
where to write warcs (default: ./warcs)
|
where to write warcs (default: ./warcs)
|
||||||
-z, --gzip write gzip-compressed warc records (default: False)
|
-z, --gzip write gzip-compressed warc records (default: False)
|
||||||
@@ -47,6 +49,12 @@ incorporated into warctools mainline.
|
|||||||
WARC filename prefix (default: WARCPROX)
|
WARC filename prefix (default: WARCPROX)
|
||||||
-s SIZE, --size SIZE WARC file rollover size threshold in bytes (default:
|
-s SIZE, --size SIZE WARC file rollover size threshold in bytes (default:
|
||||||
1000000000)
|
1000000000)
|
||||||
|
--rollover-idle-time ROLLOVER_IDLE_TIME
|
||||||
|
WARC file rollover idle time threshold in seconds (so
|
||||||
|
that Friday's last open WARC doesn't sit there all
|
||||||
|
weekend waiting for more data) (default: None)
|
||||||
|
--base32 write SHA1 digests in Base32 instead of hex (default:
|
||||||
|
False)
|
||||||
-v, --verbose
|
-v, --verbose
|
||||||
-q, --quiet
|
-q, --quiet
|
||||||
|
|
||||||
@@ -59,7 +67,7 @@ incorporated into warctools mainline.
|
|||||||
- keep statistics, produce reports
|
- keep statistics, produce reports
|
||||||
- write cdx while crawling?
|
- write cdx while crawling?
|
||||||
- performance testing
|
- performance testing
|
||||||
- base32 sha1 like heritrix?
|
- ~~base32 sha1 like heritrix?~~
|
||||||
- configurable timeouts and stuff
|
- configurable timeouts and stuff
|
||||||
- evaluate ipv6 support
|
- evaluate ipv6 support
|
||||||
- more explicit handling of connection closed exception during transfer? other error cases?
|
- more explicit handling of connection closed exception during transfer? other error cases?
|
||||||
|
29
warcprox.py
29
warcprox.py
@@ -23,6 +23,7 @@ import re
|
|||||||
import signal
|
import signal
|
||||||
import time
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import base64
|
||||||
|
|
||||||
class CertificateAuthority(object):
|
class CertificateAuthority(object):
|
||||||
|
|
||||||
@@ -354,6 +355,11 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
|
|||||||
# consecutively in the same warc.
|
# consecutively in the same warc.
|
||||||
class WarcRecordsetQueue(Queue.Queue):
|
class WarcRecordsetQueue(Queue.Queue):
|
||||||
|
|
||||||
|
def __init__(self, base32=False):
|
||||||
|
Queue.Queue.__init__(self)
|
||||||
|
self.base32 = base32
|
||||||
|
|
||||||
|
|
||||||
def create_and_queue(self, url, request_data, response_recorder, remote_ip):
|
def create_and_queue(self, url, request_data, response_recorder, remote_ip):
|
||||||
warc_date = warctools.warc.warc_datetime_str(datetime.now())
|
warc_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||||
|
|
||||||
@@ -373,8 +379,14 @@ class WarcRecordsetQueue(Queue.Queue):
|
|||||||
self.put(record_group)
|
self.put(record_group)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
def digest_str(self, hash_obj):
|
||||||
def make_record(url, warc_date=None, recorder=None, data=None,
|
if self.base32:
|
||||||
|
return base64.b32encode(hash_obj.digest())
|
||||||
|
else:
|
||||||
|
return hash_obj.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def make_record(self, url, warc_date=None, recorder=None, data=None,
|
||||||
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None):
|
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None):
|
||||||
|
|
||||||
if warc_date is None:
|
if warc_date is None:
|
||||||
@@ -397,16 +409,19 @@ class WarcRecordsetQueue(Queue.Queue):
|
|||||||
|
|
||||||
if recorder is not None:
|
if recorder is not None:
|
||||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
|
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
|
||||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(recorder.block_sha1.hexdigest())))
|
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||||
|
'sha1:{}'.format(self.digest_str(recorder.block_sha1))))
|
||||||
if recorder.payload_sha1 is not None:
|
if recorder.payload_sha1 is not None:
|
||||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, 'sha1:{}'.format(recorder.payload_sha1.hexdigest())))
|
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||||
|
'sha1:{}'.format(self.digest_str(recorder.payload_sha1))))
|
||||||
|
|
||||||
recorder.tempfile.seek(0)
|
recorder.tempfile.seek(0)
|
||||||
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
|
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
|
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
|
||||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(hashlib.sha1(data).hexdigest())))
|
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||||
|
'sha1:{}'.format(self.digest_str(hashlib.sha1(data)))))
|
||||||
|
|
||||||
content_tuple = content_type, data
|
content_tuple = content_type, data
|
||||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||||
@@ -573,6 +588,8 @@ if __name__ == '__main__':
|
|||||||
arg_parser.add_argument('--rollover-idle-time',
|
arg_parser.add_argument('--rollover-idle-time',
|
||||||
dest='rollover_idle_time', default=None,
|
dest='rollover_idle_time', default=None,
|
||||||
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
|
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
|
||||||
|
arg_parser.add_argument('--base32', dest='base32', action='store_true',
|
||||||
|
default=False, help='write SHA1 digests in Base32 instead of hex')
|
||||||
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
||||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||||
# [--ispartof=warcinfo ispartof]
|
# [--ispartof=warcinfo ispartof]
|
||||||
@@ -591,7 +608,7 @@ if __name__ == '__main__':
|
|||||||
logging.basicConfig(stream=sys.stdout, level=loglevel,
|
logging.basicConfig(stream=sys.stdout, level=loglevel,
|
||||||
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
|
|
||||||
recordset_q = WarcRecordsetQueue()
|
recordset_q = WarcRecordsetQueue(base32=args.base32)
|
||||||
|
|
||||||
proxy = WarcProxy(server_address=(args.address, int(args.port)),
|
proxy = WarcProxy(server_address=(args.address, int(args.port)),
|
||||||
ca_file=args.cacert, certs_dir=args.certs_dir,
|
ca_file=args.cacert, certs_dir=args.certs_dir,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user