fix crawl log handling of WARCPROX_WRITE_RECORD request

This commit is contained in:
Noah Levitt 2017-11-09 12:35:10 -08:00
parent 538c9e0caf
commit 78c6137016
4 changed files with 30 additions and 16 deletions

View File

@ -51,7 +51,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.2.1b2.dev109',
version='2.2.1b2.dev110',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -26,8 +26,9 @@ import os
import warcprox
class CrawlLogger(object):
def __init__(self, dir_):
def __init__(self, dir_, options=warcprox.Options()):
self.dir = dir_
self.options = options
if not os.path.exists(self.dir):
logging.info('creating directory %r', self.dir)
os.mkdir(self.dir)
@ -40,10 +41,20 @@ class CrawlLogger(object):
'warcFilename': records[0].warc_filename,
'warcFileOffset': records[0].offset,
}
if recorded_url.response_recorder:
content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
payload_digest = warcprox.digest_str(
recorded_url.response_recorder.payload_digest,
self.options.base32)
else:
# WARCPROX_WRITE_RECORD request
content_length = len(recorded_url.request_data)
payload_digest = records[0].get_header(
b'WARC-Payload-Digest')
fields = [
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'% 5s' % recorded_url.status,
'% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset),
'% 10s' % content_length,
recorded_url.url,
'-', # hop path
recorded_url.referer or '-',
@ -53,8 +64,7 @@ class CrawlLogger(object):
recorded_url.timestamp,
recorded_url.timestamp.microsecond//1000,
recorded_url.duration.microseconds//1000),
warcprox.digest_str(
recorded_url.response_recorder.payload_digest, True),
payload_digest,
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
'duplicate:digest' if records[0].type == b'revisit' else '-',
json.dumps(extra_info, separators=(',',':')),

View File

@ -238,7 +238,8 @@ def init_controller(args):
playback_proxy = None
if args.crawl_log_dir:
listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir))
listeners.append(warcprox.crawl_log.CrawlLogger(
args.crawl_log_dir, options=options))
for qualname in args.plugins or []:
try:

View File

@ -293,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta)
rec_custom = RecordedUrl(url=self.url,
request_data=request_data,
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'],
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
status=204, size=len(request_data),
client_ip=self.client_address[0],
method=self.command, timestamp=timestamp)
rec_custom = RecordedUrl(
url=self.url,
request_data=request_data,
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'],
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
status=204, size=len(request_data),
client_ip=self.client_address[0],
method=self.command,
timestamp=timestamp,
duration=datetime.datetime.utcnow()-timestamp)
self.server.recorded_url_q.put(rec_custom)
self.send_response(204, 'OK')