diff --git a/.travis.yml b/.travis.yml
index 20b15f2..565ba13 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ python:
 matrix:
   allow_failures:
+    - python: pypy
     - python: pypy3
     - python: nightly
     - python: 3.7-dev
diff --git a/setup.py b/setup.py
index dae6b94..f67903b 100755
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.2.1b2.dev107',
+        version='2.2.1b2.dev112',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index abc373a..af7eb05 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -254,7 +254,8 @@ def warcprox_(request):
             '--method-filter=POST',
             '--port=0',
             '--playback-port=0',
-            '--onion-tor-socks-proxy=localhost:9050']
+            '--onion-tor-socks-proxy=localhost:9050',
+            '--crawl-log-dir=crawl-logs']
     if request.config.getoption('--rethinkdb-dedup-url'):
         argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
         # test these here only
@@ -1339,6 +1340,200 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
         elif record.rec_type == 'request':
             assert record.http_headers.get_header('via') == '1.1 warcprox'
 
+def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
+    url = 'http://localhost:%s/b/b' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+    url = 'http://localhost:%s/b/c' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 500
+    assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
+
+def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
+    try:
+        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+    except:
+        pass
+
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    url = 'http://localhost:%s/b/bb' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
+        "Referer": "http://example.com/referer",
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    # tests will fail in year 3000 :)
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert crawl_log[24:31] == b'   200 '
+    assert crawl_log[31:42] == b'        54 '
+    fields = crawl_log.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    crawl_log_1 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert crawl_log_1[24:31] == b'   200 '
+    assert crawl_log_1[31:42] == b'        54 '
+    fields = crawl_log_1.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/bb')
+    assert fields[4] == b'-'
+    assert fields[5] == b'http://example.com/referer'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:9aae6acb797c75ca8eb5dded9be2127cc61b3fbb'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # should be deduplicated
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_2 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert crawl_log_2[24:31] == b'   200 '
+    assert crawl_log_2[31:42] == b'        54 '
+    fields = crawl_log_2.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/aa')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f'
+    assert fields[10] == b'http://example.com/seed'
+    assert fields[11] == b'duplicate:digest'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 145
+
+    # a request that is not saved to a warc (because of --method-filter) is
+    # nevertheless written to the crawl log, with a zero content length
+    url = 'http://localhost:%s/b/cc' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})}
+    response = requests.head(url, proxies=archiving_proxies, headers=headers)
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_3 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3)
+    assert crawl_log_3[24:31] == b'   200 '
+    assert crawl_log_3[31:42] == b'         0 '
+    fields = crawl_log_3.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/cc')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert extra_info == {'contentSize': 91}
+
+    # WARCPROX_WRITE_RECORD
+    url = 'http://fakeurl/'
+    payload = b'I am the WARCPROX_WRITE_RECORD payload'
+    headers = {
+        'Content-Type': 'text/plain',
+        'WARC-Type': 'metadata',
+        'Host': 'N/A',
+        'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_4'}),
+    }
+    response = requests.request(
+            method='WARCPROX_WRITE_RECORD', url=url, data=payload,
+            headers=headers, proxies=archiving_proxies)
+    assert response.status_code == 204
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_4 = open(os.path.join(
+            warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read()
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4)
+    assert crawl_log_4[24:31] == b'   204 '
+    assert crawl_log_4[31:42] == b'        38 '
+    fields = crawl_log_4.split()
+    assert len(fields) == 13
+    assert fields[3] == b'http://fakeurl/'
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:bb56497c17d2684f5eca4af9df908c78ba74ca1c'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 38
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     url = 'http://localhost:%s/b/g' % http_daemon.server_port
diff --git a/warcprox/__init__.py b/warcprox/__init__.py
index 5564ff3..ecd6f53 100644
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -114,3 +114,4 @@ import warcprox.warc as warc
 import warcprox.writerthread as writerthread
 import warcprox.stats as stats
 import warcprox.bigtable as bigtable
+import warcprox.crawl_log as crawl_log
diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index a87d49c..f0ed4a7 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -201,9 +201,10 @@ class RethinkCaptures:
         return entry
 
     def notify(self, recorded_url, records):
-        entry = self._assemble_entry(recorded_url, records)
-        with self._batch_lock:
-            self._batch.append(entry)
+        if records:
+            entry = self._assemble_entry(recorded_url, records)
+            with self._batch_lock:
+                self._batch.append(entry)
 
     def close(self):
         self.stop()
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
new file mode 100644
index 0000000..5b4a4fc
--- /dev/null
+++ b/warcprox/crawl_log.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+warcprox/crawl_log.py - heritrix-style crawl logger
+
+Copyright (C) 2017 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+USA.
+'''
+import logging
+import datetime
+import json
+import os
+import warcprox
+
+class CrawlLogger(object):
+    def __init__(self, dir_, options=warcprox.Options()):
+        self.dir = dir_
+        self.options = options
+        if not os.path.exists(self.dir):
+            logging.info('creating directory %r', self.dir)
+            os.mkdir(self.dir)
+
+    def notify(self, recorded_url, records):
+        # 2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
+        now = datetime.datetime.utcnow()
+        extra_info = {'contentSize': recorded_url.size,}
+        if records:
+            extra_info['warcFilename'] = records[0].warc_filename
+            extra_info['warcFileOffset'] = records[0].offset
+        if recorded_url.response_recorder:
+            content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
+            payload_digest = warcprox.digest_str(
+                    recorded_url.response_recorder.payload_digest,
+                    self.options.base32)
+        else:
+            # WARCPROX_WRITE_RECORD request
+            content_length = len(recorded_url.request_data)
+            payload_digest = records[0].get_header(b'WARC-Payload-Digest')
+        fields = [
+                '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
+                '% 5s' % recorded_url.status,
+                '% 10s' % content_length,
+                recorded_url.url,
+                '-', # hop path
+                recorded_url.referer or '-',
+                recorded_url.mimetype or '-',
+                '-',
+                '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
+                    recorded_url.timestamp,
+                    recorded_url.timestamp.microsecond//1000,
+                    recorded_url.duration.microseconds//1000),
+                payload_digest,
+                recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
+                'duplicate:digest' if records and records[0].type == b'revisit' else '-',
+                json.dumps(extra_info, separators=(',',':')),
+        ]
+        for i in range(len(fields)):
+            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
+            try:
+                fields[i] = fields[i].encode('utf-8')
+            except:
+                pass
+        line = b' '.join(fields) + b'\n'
+
+        if 'warc-prefix' in recorded_url.warcprox_meta:
+            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
+        else:
+            filename = 'crawl.log'
+
+        crawl_log_path = os.path.join(self.dir, filename)
+        with open(crawl_log_path, 'ab') as f:
+            f.write(line)
+
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 8388344..78d35db 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -96,7 +96,7 @@ class DedupDb(object):
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(
                     recorded_url.response_recorder.payload_digest,
@@ -174,7 +174,7 @@ class RethinkDedupDb:
         return result
 
     def notify(self, recorded_url, records):
-        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
+        if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                     self.options.base32)
diff --git a/warcprox/main.py b/warcprox/main.py
index c915917..a2fca9c 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -152,6 +152,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             default=None, help=(
                'host:port of tor socks proxy, used only to connect to '
                '.onion sites'))
+    arg_parser.add_argument(
+            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
+                'if specified, write crawl log files in the specified '
+                'directory; one crawl log is written per warc filename '
+                'prefix; crawl log format mimics heritrix'))
     arg_parser.add_argument(
             '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
             action='append', help=(
@@ -248,6 +253,10 @@ def init_controller(args):
     playback_index_db = None
     playback_proxy = None
 
+    if args.crawl_log_dir:
+        listeners.append(warcprox.crawl_log.CrawlLogger(
+            args.crawl_log_dir, options=options))
+
     for qualname in args.plugins or []:
         try:
             (module_name, class_name) = qualname.rsplit('.', 1)
@@ -285,22 +294,6 @@ def init_controller(args):
 
     return controller
 
-def real_main(args):
-    # see https://github.com/pyca/cryptography/issues/2911
-    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
-
-    controller = init_controller(args)
-
-    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
-    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
-    try:
-        signal.signal(signal.SIGQUIT, dump_state)
-    except AttributeError:
-        # SIGQUIT does not exist on some platforms (windows)
-        pass
-
-    controller.run_until_shutdown()
-
 def parse_args(argv=sys.argv):
     '''
     Parses command line arguments with argparse.
@@ -329,7 +322,20 @@ def main(argv=sys.argv):
             '%(asctime)s %(process)d %(levelname)s %(threadName)s '
             '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
 
-    real_main(args)
+    # see https://github.com/pyca/cryptography/issues/2911
+    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
+
+    controller = init_controller(args)
+
+    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
+    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
+    try:
+        signal.signal(signal.SIGQUIT, dump_state)
+    except AttributeError:
+        # SIGQUIT does not exist on some platforms (windows)
+        pass
+
+    controller.run_until_shutdown()
 
 def ensure_rethinkdb_tables():
     '''
diff --git a/warcprox/playback.py b/warcprox/playback.py
index a9aa47d..1a698c0 100644
--- a/warcprox/playback.py
+++ b/warcprox/playback.py
@@ -259,7 +259,8 @@ class PlaybackIndexDb(object):
             pass
 
     def notify(self, recorded_url, records):
-        self.save(records[0].warc_filename, records, records[0].offset)
+        if records:
+            self.save(records[0].warc_filename, records, records[0].offset)
 
     def save(self, warcfile, recordset, offset):
         response_record = recordset[0]
diff --git a/warcprox/stats.py b/warcprox/stats.py
index 5a87461..b4d54dd 100644
--- a/warcprox/stats.py
+++ b/warcprox/stats.py
@@ -171,12 +171,13 @@ class StatsDb:
             bucket_stats["total"]["urls"] += 1
             bucket_stats["total"]["wire_bytes"] += recorded_url.size
 
-            if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
-                bucket_stats["revisit"]["urls"] += 1
-                bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
-            else:
-                bucket_stats["new"]["urls"] += 1
-                bucket_stats["new"]["wire_bytes"] += recorded_url.size
+            if records:
+                if records[0].type == b'revisit':
+                    bucket_stats["revisit"]["urls"] += 1
+                    bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
+                else:
+                    bucket_stats["new"]["urls"] += 1
+                    bucket_stats["new"]["wire_bytes"] += recorded_url.size
 
             json_value = json.dumps(bucket_stats, separators=(',',':'))
             conn.execute(
@@ -306,8 +307,7 @@ class RethinkStatsDb(StatsDb):
 
     def tally(self, recorded_url, records):
         buckets = self.buckets(recorded_url)
-        is_revisit = records[0].get_header(
-                warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
+        is_revisit = records[0].type == b'revisit'
         with self._batch_lock:
             for bucket in buckets:
                 bucket_stats = self._batch.setdefault(
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index d37e588..afe1835 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -153,16 +153,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 limit_key, limit_value = item
                 self._enforce_limit(limit_key, limit_value, soft=True)
 
+    def _security_check(self, warcprox_meta):
+        '''
+        Raises an exception (resulting in a 500 response to the client) if
+        `warcprox_meta` specifies a 'warc-prefix' containing a slash or backslash.
+        '''
+        if warcprox_meta and 'warc-prefix' in warcprox_meta and (
+                '/' in warcprox_meta['warc-prefix']
+                or '\\' in warcprox_meta['warc-prefix']):
+            raise Exception(
+                    "request rejected by warcprox: slash and backslash are not "
+                    "permitted in warc-prefix")
+
     def _connect_to_remote_server(self):
         '''
-        Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
+        Wraps `MitmProxyHandler._connect_to_remote_server`, first enforcing
         limits and block rules in the Warcprox-Meta request header, if any.
-        Raises warcprox.RequestBlockedByRule if a rule has been enforced.
-        Otherwise calls MitmProxyHandler._connect_to_remote_server, which
-        initializes self._remote_server_sock.
+        Raises `warcprox.RequestBlockedByRule` if a rule has been enforced.
+        Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which
+        initializes `self._remote_server_sock`.
         '''
         if 'Warcprox-Meta' in self.headers:
             warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
+            self._security_check(warcprox_meta)
             self._enforce_limits(warcprox_meta)
             self._enforce_blocks(warcprox_meta)
         return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
@@ -204,7 +217,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 client_ip=self.client_address[0],
                 content_type=content_type, method=self.command,
                 timestamp=timestamp, host=self.hostname,
-                duration=datetime.datetime.utcnow()-timestamp)
+                duration=datetime.datetime.utcnow()-timestamp,
+                referer=self.headers.get('referer'))
         self.server.recorded_url_q.put(recorded_url)
         return recorded_url
 
@@ -279,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         if raw_warcprox_meta:
             warcprox_meta = json.loads(raw_warcprox_meta)
 
-        rec_custom = RecordedUrl(url=self.url,
-                request_data=request_data,
-                response_recorder=None,
-                remote_ip=b'',
-                warcprox_meta=warcprox_meta,
-                content_type=self.headers['Content-Type'],
-                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
-                status=204, size=len(request_data),
-                client_ip=self.client_address[0],
-                method=self.command, timestamp=timestamp)
+        rec_custom = RecordedUrl(
+                url=self.url,
+                request_data=request_data,
+                response_recorder=None,
+                remote_ip=b'',
+                warcprox_meta=warcprox_meta,
+                content_type=self.headers['Content-Type'],
+                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
+                status=204, size=len(request_data),
+                client_ip=self.client_address[0],
+                method=self.command,
+                timestamp=timestamp,
+                duration=datetime.datetime.utcnow()-timestamp)
 
         self.server.recorded_url_q.put(rec_custom)
         self.send_response(204, 'OK')
@@ -311,7 +328,7 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None):
+            timestamp=None, host=None, duration=None, referer=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -348,6 +365,7 @@ class RecordedUrl:
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
+        self.referer = referer
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py
index e422a65..a8a6ef7 100644
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -82,13 +82,15 @@ class WarcWriterThread(threading.Thread):
                         self.logger.info("%s urls left to write", qsize)
 
                     recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
+                    records = []
                     self.idle = None
                    if self._filter_accepts(recorded_url):
                         if self.dedup_db:
                             warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                     recorded_url, base32=self.options.base32)
                         records = self.writer_pool.write_records(recorded_url)
-                        self._final_tasks(recorded_url, records)
+
+                    self._final_tasks(recorded_url, records)
 
                     # try to release resources in a timely fashion
                     if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
@@ -134,11 +136,15 @@ class WarcWriterThread(threading.Thread):
             payload_digest = "-"
 
         # 2015-07-17T22:32:23.672Z     1         58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-        self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
-            recorded_url.client_ip, recorded_url.status, recorded_url.method,
-            recorded_url.url.decode("utf-8"), recorded_url.mimetype,
-            recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
-            records[0].warc_filename, records[0].offset))
+        type_ = records[0].type.decode("utf-8") if records else '-'
+        filename = records[0].warc_filename if records else '-'
+        offset = records[0].offset if records else '-'
+        self.logger.info(
+                "%s %s %s %s %s size=%s %s %s %s offset=%s",
+                recorded_url.client_ip, recorded_url.status,
+                recorded_url.method, recorded_url.url.decode("utf-8"),
+                recorded_url.mimetype, recorded_url.size, payload_digest,
+                type_, filename, offset)
 
     def _final_tasks(self, recorded_url, records):
         if self.listeners:
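
A minimal sketch (not part of the patch) of consuming the crawl logs this change produces. It assumes warcprox was started with --crawl-log-dir=crawl-logs, as in the test fixture above; the 13-field heritrix-style line format and the trailing JSON extra-info field mirror what test_crawl_log asserts. parse_crawl_log_line is a hypothetical helper, not part of warcprox.

import json

def parse_crawl_log_line(line):
    # Each crawl log line has 13 whitespace-separated fields; the last one is
    # a compact JSON blob carrying contentSize and, when a warc record was
    # actually written, warcFilename/warcFileOffset.
    fields = line.split()
    assert len(fields) == 13
    return {
        'timestamp': fields[0].decode('utf-8'),
        'status': int(fields[1]),
        'content_length': int(fields[2]),
        'url': fields[3].decode('utf-8'),
        'referer': fields[5].decode('utf-8'),        # '-' if absent
        'mimetype': fields[6].decode('utf-8'),
        'payload_digest': fields[9].decode('utf-8'),
        'seed': fields[10].decode('utf-8'),          # '-' if absent
        'annotation': fields[11].decode('utf-8'),    # e.g. 'duplicate:digest'
        'extra_info': json.loads(fields[12].decode('utf-8')),
    }

with open('crawl-logs/crawl.log', 'rb') as f:
    for line in f:
        entry = parse_crawl_log_line(line)
        print(entry['url'], entry['status'],
              entry['extra_info'].get('warcFilename', '-'))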