mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix crawl log handling of WARCPROX_WRITE_RECORD request
This commit is contained in:
parent
538c9e0caf
commit
78c6137016
2
setup.py
2
setup.py
@@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.2.1b2.dev109',
|
version='2.2.1b2.dev110',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@@ -26,8 +26,9 @@ import os
|
|||||||
import warcprox
|
import warcprox
|
||||||
|
|
||||||
class CrawlLogger(object):
|
class CrawlLogger(object):
|
||||||
def __init__(self, dir_):
|
def __init__(self, dir_, options=warcprox.Options()):
|
||||||
self.dir = dir_
|
self.dir = dir_
|
||||||
|
self.options = options
|
||||||
if not os.path.exists(self.dir):
|
if not os.path.exists(self.dir):
|
||||||
logging.info('creating directory %r', self.dir)
|
logging.info('creating directory %r', self.dir)
|
||||||
os.mkdir(self.dir)
|
os.mkdir(self.dir)
|
||||||
@@ -40,10 +41,20 @@ class CrawlLogger(object):
|
|||||||
'warcFilename': records[0].warc_filename,
|
'warcFilename': records[0].warc_filename,
|
||||||
'warcFileOffset': records[0].offset,
|
'warcFileOffset': records[0].offset,
|
||||||
}
|
}
|
||||||
|
if recorded_url.response_recorder:
|
||||||
|
content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
|
||||||
|
payload_digest = warcprox.digest_str(
|
||||||
|
recorded_url.response_recorder.payload_digest,
|
||||||
|
self.options.base32)
|
||||||
|
else:
|
||||||
|
# WARCPROX_WRITE_RECORD request
|
||||||
|
content_length = len(recorded_url.request_data)
|
||||||
|
payload_digest = records[0].get_header(
|
||||||
|
b'WARC-Payload-Digest')
|
||||||
fields = [
|
fields = [
|
||||||
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
|
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
|
||||||
'% 5s' % recorded_url.status,
|
'% 5s' % recorded_url.status,
|
||||||
'% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset),
|
'% 10s' % content_length,
|
||||||
recorded_url.url,
|
recorded_url.url,
|
||||||
'-', # hop path
|
'-', # hop path
|
||||||
recorded_url.referer or '-',
|
recorded_url.referer or '-',
|
||||||
@@ -53,8 +64,7 @@ class CrawlLogger(object):
|
|||||||
recorded_url.timestamp,
|
recorded_url.timestamp,
|
||||||
recorded_url.timestamp.microsecond//1000,
|
recorded_url.timestamp.microsecond//1000,
|
||||||
recorded_url.duration.microseconds//1000),
|
recorded_url.duration.microseconds//1000),
|
||||||
warcprox.digest_str(
|
payload_digest,
|
||||||
recorded_url.response_recorder.payload_digest, True),
|
|
||||||
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
|
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
|
||||||
'duplicate:digest' if records[0].type == b'revisit' else '-',
|
'duplicate:digest' if records[0].type == b'revisit' else '-',
|
||||||
json.dumps(extra_info, separators=(',',':')),
|
json.dumps(extra_info, separators=(',',':')),
|
||||||
|
@@ -238,7 +238,8 @@ def init_controller(args):
|
|||||||
playback_proxy = None
|
playback_proxy = None
|
||||||
|
|
||||||
if args.crawl_log_dir:
|
if args.crawl_log_dir:
|
||||||
listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir))
|
listeners.append(warcprox.crawl_log.CrawlLogger(
|
||||||
|
args.crawl_log_dir, options=options))
|
||||||
|
|
||||||
for qualname in args.plugins or []:
|
for qualname in args.plugins or []:
|
||||||
try:
|
try:
|
||||||
|
@@ -293,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
if raw_warcprox_meta:
|
if raw_warcprox_meta:
|
||||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
warcprox_meta = json.loads(raw_warcprox_meta)
|
||||||
|
|
||||||
rec_custom = RecordedUrl(url=self.url,
|
rec_custom = RecordedUrl(
|
||||||
request_data=request_data,
|
url=self.url,
|
||||||
response_recorder=None,
|
request_data=request_data,
|
||||||
remote_ip=b'',
|
response_recorder=None,
|
||||||
warcprox_meta=warcprox_meta,
|
remote_ip=b'',
|
||||||
content_type=self.headers['Content-Type'],
|
warcprox_meta=warcprox_meta,
|
||||||
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
|
content_type=self.headers['Content-Type'],
|
||||||
status=204, size=len(request_data),
|
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
|
||||||
client_ip=self.client_address[0],
|
status=204, size=len(request_data),
|
||||||
method=self.command, timestamp=timestamp)
|
client_ip=self.client_address[0],
|
||||||
|
method=self.command,
|
||||||
|
timestamp=timestamp,
|
||||||
|
duration=datetime.datetime.utcnow()-timestamp)
|
||||||
|
|
||||||
self.server.recorded_url_q.put(rec_custom)
|
self.server.recorded_url_q.put(rec_custom)
|
||||||
self.send_response(204, 'OK')
|
self.send_response(204, 'OK')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user