diff --git a/setup.py b/setup.py index 04d0352..78e312b 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev109', + version='2.2.1b2.dev110', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index eff4df9..68d1fbf 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -26,8 +26,9 @@ import os import warcprox class CrawlLogger(object): - def __init__(self, dir_): + def __init__(self, dir_, options=warcprox.Options()): self.dir = dir_ + self.options = options if not os.path.exists(self.dir): logging.info('creating directory %r', self.dir) os.mkdir(self.dir) @@ -40,10 +41,20 @@ class CrawlLogger(object): 'warcFilename': records[0].warc_filename, 'warcFileOffset': records[0].offset, } + if recorded_url.response_recorder: + content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset + payload_digest = warcprox.digest_str( + recorded_url.response_recorder.payload_digest, + self.options.base32) + else: + # WARCPROX_WRITE_RECORD request + content_length = len(recorded_url.request_data) + payload_digest = records[0].get_header( + b'WARC-Payload-Digest') fields = [ '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % recorded_url.status, - '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset), + '% 10s' % content_length, recorded_url.url, '-', # hop path recorded_url.referer or '-', @@ -53,8 +64,7 @@ class CrawlLogger(object): recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, recorded_url.duration.microseconds//1000), - warcprox.digest_str( - recorded_url.response_recorder.payload_digest, True), + payload_digest, recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), 'duplicate:digest' if records[0].type == b'revisit' else '-', json.dumps(extra_info, separators=(',',':')), diff --git a/warcprox/main.py b/warcprox/main.py index e21ff6a..1e6aaf8 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -238,7 +238,8 @@ def init_controller(args): playback_proxy = None if args.crawl_log_dir: - listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir)) + listeners.append(warcprox.crawl_log.CrawlLogger( + args.crawl_log_dir, options=options)) for qualname in args.plugins or []: try: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 544dc61..afe1835 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -293,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): if raw_warcprox_meta: warcprox_meta = json.loads(raw_warcprox_meta) - rec_custom = RecordedUrl(url=self.url, - request_data=request_data, - response_recorder=None, - remote_ip=b'', - warcprox_meta=warcprox_meta, - content_type=self.headers['Content-Type'], - custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'), - status=204, size=len(request_data), - client_ip=self.client_address[0], - method=self.command, timestamp=timestamp) + rec_custom = RecordedUrl( + url=self.url, + request_data=request_data, + response_recorder=None, + remote_ip=b'', + warcprox_meta=warcprox_meta, + content_type=self.headers['Content-Type'], + custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'), + status=204, size=len(request_data), + client_ip=self.client_address[0], + method=self.command, + timestamp=timestamp, + duration=datetime.datetime.utcnow()-timestamp) self.server.recorded_url_q.put(rec_custom) self.send_response(204, 'OK')