Merge branch 'master' into trough-dedup

* master:
  not gonna bother figuring out why pypy regex is not matching https://travis-ci.org/internetarchive/warcprox/jobs/299864258#L615
  fix failing test just committed, which involves running "listeners" for all urls, including those not archived; make adjustments accordingly
  make test_crawl_log expect HEAD request to be logged
  fix crawl log handling of WARCPROX_WRITE_RECORD request
  modify test_crawl_log to expect crawl log to honor --base32 setting and add tests of WARCPROX_WRITE_RECORD request and HEAD request (not written to warc)
  bump dev version number
  add --crawl-log-dir option to fix failing test
  create crawl log dir at startup if it doesn't exist
  make test pass with py27
  fix crawl log test to avoid any dedup collisions
  fix crawl log test
  heritrix-style crawl log support
  disallow slash and backslash in warc-prefix
  can't see any reason to split the main() like this (anymore?)
  add missing dependency warcio to tests_require
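
The diff below touches 12 files. The core addition is the new warcprox/crawl_log.py, wired up as a listener via the new --crawl-log-dir option; most of the remaining changes let listeners run for every url, including urls not written to a warc, so notify() implementations must now tolerate an empty records list.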
Noah Levitt, 2017-11-09 15:50:18 -08:00, commit b2adb778ee
12 changed files with 370 additions and 55 deletions

.travis.yml

@@ -13,6 +13,7 @@ python:
 matrix:
   allow_failures:
+    - python: pypy
     - python: pypy3
     - python: nightly
     - python: 3.7-dev

setup.py

@@ -52,7 +52,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.2.1b2.dev107',
+        version='2.2.1b2.dev112',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

tests/test_warcprox.py

@@ -254,7 +254,8 @@ def warcprox_(request):
             '--method-filter=POST',
             '--port=0',
             '--playback-port=0',
-            '--onion-tor-socks-proxy=localhost:9050']
+            '--onion-tor-socks-proxy=localhost:9050',
+            '--crawl-log-dir=crawl-logs']
     if request.config.getoption('--rethinkdb-dedup-url'):
         argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
     # test these here only
@@ -1339,6 +1340,200 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
         elif record.rec_type == 'request':
             assert record.http_headers.get_header('via') == '1.1 warcprox'
 
+def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
+    url = 'http://localhost:%s/b/b' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+    url = 'http://localhost:%s/b/c' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
+    try:
+        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+    except:
+        pass
+
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    url = 'http://localhost:%s/b/bb' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
+        "Referer": "http://example.com/referer",
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    # tests will fail in year 3000 :)
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert crawl_log[24:31] == b'   200 '
+    assert crawl_log[31:42] == b'        54 '
+    fields = crawl_log.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    crawl_log_1 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert crawl_log_1[24:31] == b'   200 '
+    assert crawl_log_1[31:42] == b'        54 '
+    fields = crawl_log_1.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/bb')
+    assert fields[4] == b'-'
+    assert fields[5] == b'http://example.com/referer'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:9aae6acb797c75ca8eb5dded9be2127cc61b3fbb'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # should be deduplicated
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_2 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert crawl_log_2[24:31] == b'   200 '
+    assert crawl_log_2[31:42] == b'        54 '
+    fields = crawl_log_2.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'http://example.com/seed'
+    assert fields[11] == b'duplicate:digest'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # a HEAD request is not saved to a warc (because of --method-filter),
+    # but it is still logged to the crawl log
+    url = 'http://localhost:%s/b/cc' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})}
+    response = requests.head(url, proxies=archiving_proxies, headers=headers)
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_3 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
+    assert crawl_log_3[24:31] == b'   200 '
+    assert crawl_log_3[31:42] == b'         0 '
+    fields = crawl_log_3.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/cc')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info == {'contentSize': 91}
+
+    # WARCPROX_WRITE_RECORD
+    url = 'http://fakeurl/'
+    payload = b'I am the WARCPROX_WRITE_RECORD payload'
+    headers = {
+        'Content-Type': 'text/plain',
+        'WARC-Type': 'metadata',
+        'Host': 'N/A',
+        'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_4'}),
+    }
+    response = requests.request(
+        method='WARCPROX_WRITE_RECORD', url=url, data=payload,
+        headers=headers, proxies=archiving_proxies)
+    assert response.status_code == 204
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_4 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
+    assert crawl_log_4[24:31] == b'   204 '
+    assert crawl_log_4[31:42] == b'        38 '
+    fields = crawl_log_4.split()
+    assert len(fields) == 13
+    assert fields[3] == b'http://fakeurl/'
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:bb56497c17d2684f5eca4af9df908c78ba74ca1c'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 38
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     url = 'http://localhost:%s/b/g' % http_daemon.server_port
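
The fixed-offset assertions in test_crawl_log follow from the field widths CrawlLogger uses (see warcprox/crawl_log.py below); a minimal sketch, with hypothetical sample values:

    # fields are single-space-joined after a 24-char timestamp
    # ('%Y-%m-%dT%H:%M:%S' plus '.%03dZ'), a '% 5s' status, a '% 10s' size
    ts = '2017-08-03T21:45:24.496Z'
    line = ' '.join([ts, '% 5s' % 200, '% 10s' % 54, 'http://example.com/'])
    assert line[24:31] == '   200 '      # space + 5-wide status + space
    assert line[31:42] == '        54 '  # 10-wide size + space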

warcprox/__init__.py

@@ -114,3 +114,4 @@ import warcprox.warc as warc
 import warcprox.writerthread as writerthread
 import warcprox.stats as stats
 import warcprox.bigtable as bigtable
+import warcprox.crawl_log as crawl_log

warcprox/bigtable.py

@@ -201,9 +201,10 @@ class RethinkCaptures:
         return entry
 
     def notify(self, recorded_url, records):
-        entry = self._assemble_entry(recorded_url, records)
-        with self._batch_lock:
-            self._batch.append(entry)
+        if records:
+            entry = self._assemble_entry(recorded_url, records)
+            with self._batch_lock:
+                self._batch.append(entry)
 
     def close(self):
         self.stop()
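
The guard on records here, like the matching checks added in dedup.py, playback.py, and stats.py below, follows from the fix described in the merge message: listeners now run for every url, including urls not written to a warc, so records can be empty.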

warcprox/crawl_log.py (new file, 86 lines)

@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+warcprox/crawl_log.py - heritrix-style crawl logger
+
+Copyright (C) 2017 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+USA.
+'''
+
+import logging
+import datetime
+import json
+import os
+import warcprox
+
+class CrawlLogger(object):
+    def __init__(self, dir_, options=warcprox.Options()):
+        self.dir = dir_
+        self.options = options
+        if not os.path.exists(self.dir):
+            logging.info('creating directory %r', self.dir)
+            os.mkdir(self.dir)
+
+    def notify(self, recorded_url, records):
+        # 2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
+        now = datetime.datetime.utcnow()
+        extra_info = {'contentSize': recorded_url.size,}
+        if records:
+            extra_info['warcFilename'] = records[0].warc_filename
+            extra_info['warcFileOffset'] = records[0].offset
+        if recorded_url.response_recorder:
+            content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
+            payload_digest = warcprox.digest_str(
+                    recorded_url.response_recorder.payload_digest,
+                    self.options.base32)
+        else:
+            # WARCPROX_WRITE_RECORD request
+            content_length = len(recorded_url.request_data)
+            payload_digest = records[0].get_header(b'WARC-Payload-Digest')
+        fields = [
+            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
+            '% 5s' % recorded_url.status,
+            '% 10s' % content_length,
+            recorded_url.url,
+            '-', # hop path
+            recorded_url.referer or '-',
+            recorded_url.mimetype or '-',
+            '-',
+            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
+                recorded_url.timestamp,
+                recorded_url.timestamp.microsecond//1000,
+                recorded_url.duration.microseconds//1000),
+            payload_digest,
+            recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
+            'duplicate:digest' if records and records[0].type == b'revisit' else '-',
+            json.dumps(extra_info, separators=(',',':')),
+        ]
+        for i in range(len(fields)):
+            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
+            try:
+                fields[i] = fields[i].encode('utf-8')
+            except:
+                pass
+        line = b' '.join(fields) + b'\n'
+
+        if 'warc-prefix' in recorded_url.warcprox_meta:
+            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
+        else:
+            filename = 'crawl.log'
+
+        crawl_log_path = os.path.join(self.dir, filename)
+        with open(crawl_log_path, 'ab') as f:
+            f.write(line)
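
A minimal usage sketch (directory path hypothetical), mirroring how init_controller wires the logger up in warcprox/main.py below:

    import warcprox
    import warcprox.crawl_log

    # the constructor creates the directory if it does not exist
    logger = warcprox.crawl_log.CrawlLogger(
            '/tmp/crawl-logs', options=warcprox.Options())
    # registered as a listener, notify(recorded_url, records) is called for
    # every url and appends one line of 13 space-separated fields: timestamp,
    # status, content size, url, hop path ('-'), referer, mimetype, '-',
    # fetch timestamp+duration, payload digest, seed, 'duplicate:digest' or
    # '-', and a JSON blob (contentSize, plus warcFilename/warcFileOffset
    # when records were written to a warc)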

warcprox/dedup.py

@@ -96,7 +96,7 @@ class DedupDb(object):
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(
                     recorded_url.response_recorder.payload_digest,
@@ -174,7 +174,7 @@ class RethinkDedupDb:
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                                              self.options.base32)

warcprox/main.py

@@ -152,6 +152,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    arg_parser.add_argument(
+            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
+                'if specified, write crawl log files in the specified '
+                'directory; one crawl log is written per warc filename '
+                'prefix; crawl log format mimics heritrix'))
     arg_parser.add_argument(
             '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
             action='append', help=(
@@ -248,6 +253,10 @@ def init_controller(args):
         playback_index_db = None
         playback_proxy = None
 
+    if args.crawl_log_dir:
+        listeners.append(warcprox.crawl_log.CrawlLogger(
+            args.crawl_log_dir, options=options))
+
     for qualname in args.plugins or []:
         try:
             (module_name, class_name) = qualname.rsplit('.', 1)
@@ -285,22 +294,6 @@ def init_controller(args):
 
     return controller
 
-def real_main(args):
-    # see https://github.com/pyca/cryptography/issues/2911
-    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
-
-    controller = init_controller(args)
-
-    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
-    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
-    try:
-        signal.signal(signal.SIGQUIT, dump_state)
-    except AttributeError:
-        # SIGQUIT does not exist on some platforms (windows)
-        pass
-
-    controller.run_until_shutdown()
-
 def parse_args(argv=sys.argv):
     '''
     Parses command line arguments with argparse.
@@ -329,7 +322,20 @@ def main(argv=sys.argv):
                 '%(asctime)s %(process)d %(levelname)s %(threadName)s '
                 '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
 
-    real_main(args)
+    # see https://github.com/pyca/cryptography/issues/2911
+    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
+
+    controller = init_controller(args)
+
+    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
+    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
+    try:
+        signal.signal(signal.SIGQUIT, dump_state)
+    except AttributeError:
+        # SIGQUIT does not exist on some platforms (windows)
+        pass
+
+    controller.run_until_shutdown()
 
 def ensure_rethinkdb_tables():
     '''
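
In practice, then, a command line like 'warcprox --crawl-log-dir=./crawl-logs' (directory name hypothetical; the test fixture above uses crawl-logs) is enough to enable the feature: init_controller appends a CrawlLogger to the listeners, yielding crawl.log plus one <warc-prefix>.log per prefix received via the Warcprox-Meta header.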

warcprox/playback.py

@@ -259,7 +259,8 @@ class PlaybackIndexDb(object):
         pass
 
     def notify(self, recorded_url, records):
-        self.save(records[0].warc_filename, records, records[0].offset)
+        if records:
+            self.save(records[0].warc_filename, records, records[0].offset)
 
     def save(self, warcfile, recordset, offset):
         response_record = recordset[0]

warcprox/stats.py

@@ -171,12 +171,13 @@ class StatsDb:
             bucket_stats["total"]["urls"] += 1
             bucket_stats["total"]["wire_bytes"] += recorded_url.size
 
-            if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
-                bucket_stats["revisit"]["urls"] += 1
-                bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
-            else:
-                bucket_stats["new"]["urls"] += 1
-                bucket_stats["new"]["wire_bytes"] += recorded_url.size
+            if records:
+                if records[0].type == b'revisit':
+                    bucket_stats["revisit"]["urls"] += 1
+                    bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
+                else:
+                    bucket_stats["new"]["urls"] += 1
+                    bucket_stats["new"]["wire_bytes"] += recorded_url.size
 
             json_value = json.dumps(bucket_stats, separators=(',',':'))
             conn.execute(
@@ -306,8 +307,7 @@ class RethinkStatsDb(StatsDb):
     def tally(self, recorded_url, records):
         buckets = self.buckets(recorded_url)
-        is_revisit = records[0].get_header(
-                warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
+        is_revisit = records[0].type == b'revisit'
         with self._batch_lock:
             for bucket in buckets:
                 bucket_stats = self._batch.setdefault(

warcprox/warcproxy.py

@@ -153,16 +153,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
             limit_key, limit_value = item
             self._enforce_limit(limit_key, limit_value, soft=True)
 
+    def _security_check(self, warcprox_meta):
+        '''
+        Raises an exception if `warcprox_meta` specifies a 'warc-prefix'
+        containing a slash or backslash; the client sees the rejection as
+        a 500 response.
+        '''
+        if warcprox_meta and 'warc-prefix' in warcprox_meta and (
+                '/' in warcprox_meta['warc-prefix']
+                or '\\' in warcprox_meta['warc-prefix']):
+            raise Exception(
+                    "request rejected by warcprox: slash and backslash are not "
+                    "permitted in warc-prefix")
+
     def _connect_to_remote_server(self):
         '''
-        Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
+        Wraps `MitmProxyHandler._connect_to_remote_server`, first enforcing
         limits and block rules in the Warcprox-Meta request header, if any.
-        Raises warcprox.RequestBlockedByRule if a rule has been enforced.
-        Otherwise calls MitmProxyHandler._connect_to_remote_server, which
-        initializes self._remote_server_sock.
+        Raises `warcprox.RequestBlockedByRule` if a rule has been enforced.
+        Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which
+        initializes `self._remote_server_sock`.
         '''
         if 'Warcprox-Meta' in self.headers:
             warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
+            self._security_check(warcprox_meta)
             self._enforce_limits(warcprox_meta)
             self._enforce_blocks(warcprox_meta)
         return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
@@ -204,7 +217,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 client_ip=self.client_address[0],
                 content_type=content_type, method=self.command,
                 timestamp=timestamp, host=self.hostname,
-                duration=datetime.datetime.utcnow()-timestamp)
+                duration=datetime.datetime.utcnow()-timestamp,
+                referer=self.headers.get('referer'))
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -279,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if raw_warcprox_meta:
             warcprox_meta = json.loads(raw_warcprox_meta)
 
-        rec_custom = RecordedUrl(url=self.url,
-                request_data=request_data,
-                response_recorder=None,
-                remote_ip=b'',
-                warcprox_meta=warcprox_meta,
-                content_type=self.headers['Content-Type'],
-                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
-                status=204, size=len(request_data),
-                client_ip=self.client_address[0],
-                method=self.command, timestamp=timestamp)
+        rec_custom = RecordedUrl(
+                url=self.url,
+                request_data=request_data,
+                response_recorder=None,
+                remote_ip=b'',
+                warcprox_meta=warcprox_meta,
+                content_type=self.headers['Content-Type'],
+                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
+                status=204, size=len(request_data),
+                client_ip=self.client_address[0],
+                method=self.command,
+                timestamp=timestamp,
+                duration=datetime.datetime.utcnow()-timestamp)
 
         self.server.recorded_url_q.put(rec_custom)
         self.send_response(204, 'OK')
@@ -311,7 +328,7 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None):
+            timestamp=None, host=None, duration=None, referer=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -348,6 +365,7 @@ class RecordedUrl:
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
+        self.referer = referer
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
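
The referer and duration plumbed into RecordedUrl here are what CrawlLogger consumes above: referer becomes fields[5] of the crawl log line, and duration supplies the '+NNN' milliseconds suffix in fields[8].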

warcprox/writerthread.py

@@ -82,13 +82,15 @@ class WarcWriterThread(threading.Thread):
                     self.logger.info("%s urls left to write", qsize)
 
                 recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
+                records = []
                 self.idle = None
                 if self._filter_accepts(recorded_url):
                     if self.dedup_db:
                         warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                 recorded_url, base32=self.options.base32)
                     records = self.writer_pool.write_records(recorded_url)
-                    self._final_tasks(recorded_url, records)
+
+                self._final_tasks(recorded_url, records)
 
                 # try to release resources in a timely fashion
                 if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
@@ -134,11 +136,15 @@ class WarcWriterThread(threading.Thread):
             payload_digest = "-"
 
         # 2015-07-17T22:32:23.672Z     1         58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-        self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
-            recorded_url.client_ip, recorded_url.status, recorded_url.method,
-            recorded_url.url.decode("utf-8"), recorded_url.mimetype,
-            recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
-            records[0].warc_filename, records[0].offset))
+        type_ = records[0].type.decode("utf-8") if records else '-'
+        filename = records[0].warc_filename if records else '-'
+        offset = records[0].offset if records else '-'
+        self.logger.info(
+                "%s %s %s %s %s size=%s %s %s %s offset=%s",
+                recorded_url.client_ip, recorded_url.status,
+                recorded_url.method, recorded_url.url.decode("utf-8"),
+                recorded_url.mimetype, recorded_url.size, payload_digest,
+                type_, filename, offset)
 
     def _final_tasks(self, recorded_url, records):
         if self.listeners: