mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
--base32 write SHA1 digests in Base32 instead of hex (default: False)
This commit is contained in:
parent
1ab5c1f683
commit
57c21920bd
16
README.md
16
README.md
@ -25,7 +25,8 @@ incorporated into warctools mainline.
|
||||
|
||||
usage: warcprox.py [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
||||
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
||||
[-s SIZE] [-v] [-q]
|
||||
[-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
|
||||
[--base32] [-v] [-q]
|
||||
|
||||
warcprox - WARC writing MITM HTTP/S proxy
|
||||
|
||||
@ -36,10 +37,11 @@ incorporated into warctools mainline.
|
||||
address to listen on (default: localhost)
|
||||
-c CACERT, --cacert CACERT
|
||||
CA certificate file; if file does not exist, it will
|
||||
be created (default: ./warcprox-ca.pem)
|
||||
be created (default: ./desktop-nlevitt-warcprox-
|
||||
ca.pem)
|
||||
--certs-dir CERTS_DIR
|
||||
where to store and load generated certificates
|
||||
(default: ./warcprox-ca)
|
||||
(default: ./desktop-nlevitt-warcprox-ca)
|
||||
-d DIRECTORY, --dir DIRECTORY
|
||||
where to write warcs (default: ./warcs)
|
||||
-z, --gzip write gzip-compressed warc records (default: False)
|
||||
@ -47,6 +49,12 @@ incorporated into warctools mainline.
|
||||
WARC filename prefix (default: WARCPROX)
|
||||
-s SIZE, --size SIZE WARC file rollover size threshold in bytes (default:
|
||||
1000000000)
|
||||
--rollover-idle-time ROLLOVER_IDLE_TIME
|
||||
WARC file rollover idle time threshold in seconds (so
|
||||
that Friday's last open WARC doesn't sit there all
|
||||
weekend waiting for more data) (default: None)
|
||||
--base32 write SHA1 digests in Base32 instead of hex (default:
|
||||
False)
|
||||
-v, --verbose
|
||||
-q, --quiet
|
||||
|
||||
@ -59,7 +67,7 @@ incorporated into warctools mainline.
|
||||
- keep statistics, produce reports
|
||||
- write cdx while crawling?
|
||||
- performance testing
|
||||
- base32 sha1 like heritrix?
|
||||
- ~~base32 sha1 like heritrix?~~
|
||||
- configurable timeouts and stuff
|
||||
- evaluate ipv6 support
|
||||
- more explicit handling of connection closed exception during transfer? other error cases?
|
||||
|
29
warcprox.py
29
warcprox.py
@ -23,6 +23,7 @@ import re
|
||||
import signal
|
||||
import time
|
||||
import tempfile
|
||||
import base64
|
||||
|
||||
class CertificateAuthority(object):
|
||||
|
||||
@ -354,6 +355,11 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
|
||||
# consecutively in the same warc.
|
||||
class WarcRecordsetQueue(Queue.Queue):
|
||||
|
||||
def __init__(self, base32=False):
    """Create an empty recordset queue.

    base32 -- when true, digest_str() renders digests in Base32
              instead of hex (default: False).
    """
    # Initialise the underlying queue explicitly through the base class.
    Queue.Queue.__init__(self)

    # Remembered for digest_str(), which picks the digest encoding.
    self.base32 = base32
|
||||
|
||||
|
||||
def create_and_queue(self, url, request_data, response_recorder, remote_ip):
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||
|
||||
@ -373,8 +379,14 @@ class WarcRecordsetQueue(Queue.Queue):
|
||||
self.put(record_group)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def make_record(url, warc_date=None, recorder=None, data=None,
|
||||
def digest_str(self, hash_obj):
    """Return the digest of hash_obj as a native string.

    hash_obj -- a hashlib-style object providing digest() and hexdigest().

    Returns the Base32 encoding of the raw digest when self.base32 is
    true, otherwise the hexadecimal digest.
    """
    if not self.base32:
        return hash_obj.hexdigest()

    encoded = base64.b32encode(hash_obj.digest())
    # b32encode returns bytes on Python 3; without decoding, callers doing
    # 'sha1:{}'.format(...) would embed a b'...' repr in the header value.
    # On Python 2 the result is already str and is returned unchanged.
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded
|
||||
|
||||
|
||||
def make_record(self, url, warc_date=None, recorder=None, data=None,
|
||||
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None):
|
||||
|
||||
if warc_date is None:
|
||||
@ -397,16 +409,19 @@ class WarcRecordsetQueue(Queue.Queue):
|
||||
|
||||
if recorder is not None:
|
||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(recorder.block_sha1.hexdigest())))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||
'sha1:{}'.format(self.digest_str(recorder.block_sha1))))
|
||||
if recorder.payload_sha1 is not None:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, 'sha1:{}'.format(recorder.payload_sha1.hexdigest())))
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||
'sha1:{}'.format(self.digest_str(recorder.payload_sha1))))
|
||||
|
||||
recorder.tempfile.seek(0)
|
||||
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
|
||||
|
||||
else:
|
||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST, 'sha1:{}'.format(hashlib.sha1(data).hexdigest())))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||
'sha1:{}'.format(self.digest_str(hashlib.sha1(data)))))
|
||||
|
||||
content_tuple = content_type, data
|
||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||
@ -573,6 +588,8 @@ if __name__ == '__main__':
|
||||
arg_parser.add_argument('--rollover-idle-time',
|
||||
dest='rollover_idle_time', default=None,
|
||||
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
|
||||
arg_parser.add_argument('--base32', dest='base32', action='store_true',
|
||||
default=False, help='write SHA1 digests in Base32 instead of hex')
|
||||
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||
# [--ispartof=warcinfo ispartof]
|
||||
@ -591,7 +608,7 @@ if __name__ == '__main__':
|
||||
logging.basicConfig(stream=sys.stdout, level=loglevel,
|
||||
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
|
||||
recordset_q = WarcRecordsetQueue()
|
||||
recordset_q = WarcRecordsetQueue(base32=args.base32)
|
||||
|
||||
proxy = WarcProxy(server_address=(args.address, int(args.port)),
|
||||
ca_file=args.cacert, certs_dir=args.certs_dir,
|
||||
|
Loading…
x
Reference in New Issue
Block a user