mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix crawl log handling of WARCPROX_WRITE_RECORD request
This commit is contained in:
parent
538c9e0caf
commit
78c6137016
2
setup.py
2
setup.py
@@ -51,7 +51,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.2.1b2.dev109',
|
||||
version='2.2.1b2.dev110',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@@ -26,8 +26,9 @@ import os
|
||||
import warcprox
|
||||
|
||||
class CrawlLogger(object):
|
||||
def __init__(self, dir_):
|
||||
def __init__(self, dir_, options=warcprox.Options()):
|
||||
self.dir = dir_
|
||||
self.options = options
|
||||
if not os.path.exists(self.dir):
|
||||
logging.info('creating directory %r', self.dir)
|
||||
os.mkdir(self.dir)
|
||||
@@ -40,10 +41,20 @@ class CrawlLogger(object):
|
||||
'warcFilename': records[0].warc_filename,
|
||||
'warcFileOffset': records[0].offset,
|
||||
}
|
||||
if recorded_url.response_recorder:
|
||||
content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
|
||||
payload_digest = warcprox.digest_str(
|
||||
recorded_url.response_recorder.payload_digest,
|
||||
self.options.base32)
|
||||
else:
|
||||
# WARCPROX_WRITE_RECORD request
|
||||
content_length = len(recorded_url.request_data)
|
||||
payload_digest = records[0].get_header(
|
||||
b'WARC-Payload-Digest')
|
||||
fields = [
|
||||
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
|
||||
'% 5s' % recorded_url.status,
|
||||
'% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset),
|
||||
'% 10s' % content_length,
|
||||
recorded_url.url,
|
||||
'-', # hop path
|
||||
recorded_url.referer or '-',
|
||||
@@ -53,8 +64,7 @@ class CrawlLogger(object):
|
||||
recorded_url.timestamp,
|
||||
recorded_url.timestamp.microsecond//1000,
|
||||
recorded_url.duration.microseconds//1000),
|
||||
warcprox.digest_str(
|
||||
recorded_url.response_recorder.payload_digest, True),
|
||||
payload_digest,
|
||||
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
|
||||
'duplicate:digest' if records[0].type == b'revisit' else '-',
|
||||
json.dumps(extra_info, separators=(',',':')),
|
||||
|
@@ -238,7 +238,8 @@ def init_controller(args):
|
||||
playback_proxy = None
|
||||
|
||||
if args.crawl_log_dir:
|
||||
listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir))
|
||||
listeners.append(warcprox.crawl_log.CrawlLogger(
|
||||
args.crawl_log_dir, options=options))
|
||||
|
||||
for qualname in args.plugins or []:
|
||||
try:
|
||||
|
@@ -293,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
if raw_warcprox_meta:
|
||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
||||
|
||||
rec_custom = RecordedUrl(url=self.url,
|
||||
request_data=request_data,
|
||||
response_recorder=None,
|
||||
remote_ip=b'',
|
||||
warcprox_meta=warcprox_meta,
|
||||
content_type=self.headers['Content-Type'],
|
||||
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
|
||||
status=204, size=len(request_data),
|
||||
client_ip=self.client_address[0],
|
||||
method=self.command, timestamp=timestamp)
|
||||
rec_custom = RecordedUrl(
|
||||
url=self.url,
|
||||
request_data=request_data,
|
||||
response_recorder=None,
|
||||
remote_ip=b'',
|
||||
warcprox_meta=warcprox_meta,
|
||||
content_type=self.headers['Content-Type'],
|
||||
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
|
||||
status=204, size=len(request_data),
|
||||
client_ip=self.client_address[0],
|
||||
method=self.command,
|
||||
timestamp=timestamp,
|
||||
duration=datetime.datetime.utcnow()-timestamp)
|
||||
|
||||
self.server.recorded_url_q.put(rec_custom)
|
||||
self.send_response(204, 'OK')
|
||||
|
Loading…
x
Reference in New Issue
Block a user