From 027a242e19da8de738cd154407c812d07b9e79c2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 3 Aug 2017 15:18:20 -0700 Subject: [PATCH 01/15] add missing dependency warcio to tests_require --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cbc50e6..c09998d 100755 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ setuptools.setup( license='GPL', packages=['warcprox'], install_requires=deps, - tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 + tests_require=['requests>=2.0.1', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', entry_points={ From 0cf283f058cfa48e0d3a309aa749357382e1af9c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 3 Aug 2017 15:19:57 -0700 Subject: [PATCH 02/15] can't see any reason to split the main() like this (anymore?) --- warcprox/main.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index f5d0597..54355d7 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -262,18 +262,6 @@ def init_controller(args): return controller -def real_main(args): - # see https://github.com/pyca/cryptography/issues/2911 - cryptography.hazmat.backends.openssl.backend.activate_builtin_random() - - controller = init_controller(args) - - signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) - signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) - signal.signal(signal.SIGQUIT, dump_state) - - controller.run_until_shutdown() - def parse_args(argv=sys.argv): ''' Parses command line arguments with argparse. @@ -303,7 +291,16 @@ def main(argv=sys.argv): '%(asctime)s %(process)d %(levelname)s %(threadName)s ' '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) - real_main(args) + # see https://github.com/pyca/cryptography/issues/2911 + cryptography.hazmat.backends.openssl.backend.activate_builtin_random() + + controller = init_controller(args) + + signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGQUIT, dump_state) + + controller.run_until_shutdown() def ensure_rethinkdb_tables(): ''' From 7aed867c90873f5aa41a5c3841a757f9080bd690 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 11:30:52 -0700 Subject: [PATCH 03/15] disallow slash and backslash in warc-prefix --- tests/test_warcprox.py | 13 ++++++++ warcprox/crawl_log.py | 68 ++++++++++++++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 21 ++++++++++--- 3 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 warcprox/crawl_log.py diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index dd80a86..fb908d9 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1429,6 +1429,19 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback elif record.rec_type == 'request': assert record.http_headers.get_header('via') == '1.1 warcprox' +def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies): + url = 'http://localhost:%s/b/b' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 500 + assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' + + url = 'http://localhost:%s/b/c' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 500 + assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' + if __name__ == '__main__': pytest.main() diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py new file mode 100644 index 0000000..4c04563 --- /dev/null +++ b/warcprox/crawl_log.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +''' +warcprox/crawl_log.py - heritrix-style crawl logger + +Copyright (C) 2017 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' +import logging +import datetime +import json + +class CrawlLogger(object): + def __init__(self, dir_): + self.dir = dir_ + + def notify(self, recorded_url, records): + # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} + now = datetime.datetime.utcnow() + extra_info = { + 'contentSize': recorded_url.size, + 'warcFilename': records[0].warc_filename, + 'warcFileOffset': records[0].offset, + } + fields = [ + '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000), + '% 5s' % recorded_url.status, + '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset), + recorded_url.url, + '-', # hop path + recorded_url.referer or '-', + recorded_url.mimetype, + '-', + '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( + recorded_url.timestamp, recorded_url.microsecond//1000, + recorded_url.duration.microseconds//1000), + warcprox.digest_str( + recorded_url.response_recorder.payload_digest, True), + recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), + 'duplicate:digest' if records[0].type == b'revisit' else '0', + json.dumps(extra_info, separators=(',',':')), + ] + for i in range(len(fields)): + # `fields` is a mix of `bytes` and `unicode`, make them all `bytes + try: + fields[i] = fields[i].encode('utf-8') + except: + pass + line = b' '.join(fields) + + if 'warc-prefix' in recorded_url.warcprox_meta: + filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix'] + os.path.join( + self.dir, ) + diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b6c96d6..6cbc9e4 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -156,16 +156,29 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): limit_key, limit_value = item self._enforce_limit(limit_key, limit_value, soft=True) + def _security_check(self, warcprox_meta): + ''' + Sends a 400 if `warcprox_meta` specifies a 'warc-prefix' and the + 'warc-prefix' contains a slash or backslash. + ''' + if warcprox_meta and 'warc-prefix' in warcprox_meta and ( + '/' in warcprox_meta['warc-prefix'] + or '\\' in warcprox_meta['warc-prefix']): + raise Exception( + "request rejected by warcprox: slash and backslash are not " + "permitted in warc-prefix") + def _connect_to_remote_server(self): ''' - Wraps MitmProxyHandler._connect_to_remote_server, first enforcing + Wraps `MitmProxyHandler._connect_to_remote_server`, first enforcing limits and block rules in the Warcprox-Meta request header, if any. - Raises warcprox.RequestBlockedByRule if a rule has been enforced. - Otherwise calls MitmProxyHandler._connect_to_remote_server, which - initializes self._remote_server_sock. + Raises `warcprox.RequestBlockedByRule` if a rule has been enforced. + Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which + initializes `self._remote_server_sock`. ''' if 'Warcprox-Meta' in self.headers: warcprox_meta = json.loads(self.headers['Warcprox-Meta']) + self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) self._enforce_blocks(warcprox_meta) return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self) From ecb07fc9cdc38ff3936e7cf68c780a703452147a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 13:07:54 -0700 Subject: [PATCH 04/15] heritrix-style crawl log support --- tests/test_warcprox.py | 106 +++++++++++++++++++++++++++++++++++++++-- warcprox/__init__.py | 1 + warcprox/crawl_log.py | 23 +++++---- warcprox/main.py | 8 ++++ warcprox/warcproxy.py | 6 ++- 5 files changed, 131 insertions(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index fb908d9..bdace29 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -374,13 +374,17 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry): options.method_filter = ['GET','POST'] + options.crawl_log_dir = tempfile.mkdtemp( + prefix='warcprox-test-', suffix='-crawl-log') + crawl_logger = warcprox.crawl_log.CrawlLogger(options.crawl_log_dir) + writer_pool = warcprox.writer.WarcWriterPool(options) warc_writer_threads = [ warcprox.writerthread.WarcWriterThread( recorded_url_q=recorded_url_q, writer_pool=writer_pool, dedup_db=dedup_db, listeners=[ - captures_db or dedup_db, playback_index_db, stats_db], - options=options) + captures_db or dedup_db, playback_index_db, stats_db, + crawl_logger], options=options) for i in range(int(proxy.max_threads ** 0.5))] warcprox_ = warcprox.controller.WarcproxController( @@ -396,7 +400,8 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry): logging.info('stopping warcprox') warcprox_.stop.set() warcprox_thread.join() - for f in (ca_file, ca_dir, options.directory, playback_index_db_file): + for f in (ca_file, ca_dir, options.directory, playback_index_db_file, + options.crawl_log_dir): if os.path.isdir(f): logging.info('deleting directory {}'.format(f)) shutil.rmtree(f) @@ -1442,6 +1447,101 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 500 assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' +def test_crawl_log(warcprox_, http_daemon, archiving_proxies): + try: + os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log')) + except: + pass + + url = 'http://localhost:%s/b/d' % http_daemon.server_port + response = requests.get(url, proxies=archiving_proxies) + assert response.status_code == 200 + + url = 'http://localhost:%s/b/e' % http_daemon.server_port + headers = { + "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}), + "Referer": "http://example.com/referer", + } + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 200 + + # should be deduplicated + url = 'http://localhost:%s/b/d' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({ + "warc-prefix": "test_crawl_log_2", + "metadata": {"seed": "http://example.com/seed"}})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 200 + + start = time.time() + while time.time() - start < 10: + if os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')): + break + time.sleep(0.5) + + crawl_log = open(os.path.join( + warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read() + crawl_log_1 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read() + crawl_log_2 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read() + + # tests will fail in year 3000 :) + assert re.match(b'\A2[^\n]+\n\Z', crawl_log) + assert crawl_log[24:31] == b' 200 ' + assert crawl_log[31:42] == b' 44 ' + fields = crawl_log.split() + assert len(fields) == 13 + assert fields[3].endswith(b'/b/d') + assert fields[4] == b'-' + assert fields[5] == b'-' + assert fields[6] == b'text/plain' + assert fields[7] == b'-' + assert re.match(br'^\d{17}[+]\d{3}', fields[8]) + assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM' + assert fields[10] == b'-' + assert fields[11] == b'-' + extra_info = json.loads(fields[12].decode('utf-8')) + assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert extra_info['contentSize'] == 135 + + assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1) + assert crawl_log_1[24:31] == b' 200 ' + assert crawl_log_1[31:42] == b' 44 ' + fields = crawl_log_1.split() + assert len(fields) == 13 + assert fields[3].endswith(b'/b/e') + assert fields[4] == b'-' + assert fields[5] == b'http://example.com/referer' + assert fields[6] == b'text/plain' + assert fields[7] == b'-' + assert re.match(br'^\d{17}[+]\d{3}', fields[8]) + assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ' + assert fields[10] == b'-' + assert fields[11] == b'-' + extra_info = json.loads(fields[12].decode('utf-8')) + assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert extra_info['contentSize'] == 135 + + assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2) + assert crawl_log_2[24:31] == b' 200 ' + assert crawl_log_2[31:42] == b' 44 ' + fields = crawl_log_2.split() + assert len(fields) == 13 + assert fields[3].endswith(b'/b/d') + assert fields[4] == b'-' + assert fields[5] == b'-' + assert fields[6] == b'text/plain' + assert fields[7] == b'-' + assert re.match(br'^\d{17}[+]\d{3}', fields[8]) + assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM' + assert fields[10] == b'http://example.com/seed' + assert fields[11] == b'-' + extra_info = json.loads(fields[12].decode('utf-8')) + assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert extra_info['contentSize'] == 135 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 5564ff3..ecd6f53 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -114,3 +114,4 @@ import warcprox.warc as warc import warcprox.writerthread as writerthread import warcprox.stats as stats import warcprox.bigtable as bigtable +import warcprox.crawl_log as crawl_log diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 4c04563..6888110 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -22,6 +22,8 @@ USA. import logging import datetime import json +import os +import warcprox class CrawlLogger(object): def __init__(self, dir_): @@ -36,33 +38,38 @@ class CrawlLogger(object): 'warcFileOffset': records[0].offset, } fields = [ - '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000), + '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % recorded_url.status, '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset), recorded_url.url, '-', # hop path recorded_url.referer or '-', - recorded_url.mimetype, + recorded_url.mimetype or '-', '-', '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( - recorded_url.timestamp, recorded_url.microsecond//1000, + recorded_url.timestamp, + recorded_url.timestamp.microsecond//1000, recorded_url.duration.microseconds//1000), warcprox.digest_str( recorded_url.response_recorder.payload_digest, True), recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), - 'duplicate:digest' if records[0].type == b'revisit' else '0', + 'duplicate:digest' if records[0].type == b'revisit' else '-', json.dumps(extra_info, separators=(',',':')), ] for i in range(len(fields)): - # `fields` is a mix of `bytes` and `unicode`, make them all `bytes + # `fields` is a mix of `bytes` and `unicode`, make them all `bytes` try: fields[i] = fields[i].encode('utf-8') except: pass - line = b' '.join(fields) + line = b' '.join(fields) + b'\n' if 'warc-prefix' in recorded_url.warcprox_meta: filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix'] - os.path.join( - self.dir, ) + else: + filename = 'crawl.log' + + crawl_log_path = os.path.join(self.dir, filename) + with open(crawl_log_path, 'ab') as f: + f.write(line) diff --git a/warcprox/main.py b/warcprox/main.py index 54355d7..30f85d2 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -128,6 +128,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default=None, help=( 'host:port of tor socks proxy, used only to connect to ' '.onion sites')) + arg_parser.add_argument( + '--crawl-log-dir', dest='crawl_log_dir', default=None, help=( + 'if specified, write crawl log files in the specified ' + 'directory; one crawl log is written per warc filename ' + 'prefix; crawl log format mimics heritrix')) arg_parser.add_argument( '--plugin', metavar='PLUGIN_CLASS', dest='plugins', action='append', help=( @@ -228,6 +233,9 @@ def init_controller(args): playback_index_db = None playback_proxy = None + if args.crawl_log_dir: + listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir)) + for qualname in args.plugins or []: try: (module_name, class_name) = qualname.rsplit('.', 1) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 6cbc9e4..95ca81f 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -207,7 +207,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): client_ip=self.client_address[0], content_type=prox_rec_res.getheader("Content-Type"), method=self.command, timestamp=timestamp, host=self.hostname, - duration=datetime.datetime.utcnow()-timestamp) + duration=datetime.datetime.utcnow()-timestamp, + referer=self.headers.get('referer')) self.server.recorded_url_q.put(recorded_url) return recorded_url @@ -314,7 +315,7 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, - timestamp=None, host=None, duration=None): + timestamp=None, host=None, duration=None, referer=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -351,6 +352,7 @@ class RecordedUrl: self.timestamp = timestamp self.host = host self.duration = duration + self.referer = referer # inherit from object so that multiple inheritance from this class works # properly in python 2 From edcc2cc296ce44ad0e4f40ed0654bd63cb7ce03a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 13:23:51 -0700 Subject: [PATCH 05/15] fix crawl log test --- tests/test_warcprox.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index bdace29..63c96d2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1465,18 +1465,10 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers) assert response.status_code == 200 - # should be deduplicated - url = 'http://localhost:%s/b/d' % http_daemon.server_port - headers = {"Warcprox-Meta": json.dumps({ - "warc-prefix": "test_crawl_log_2", - "metadata": {"seed": "http://example.com/seed"}})} - response = requests.get(url, proxies=archiving_proxies, headers=headers) - assert response.status_code == 200 - start = time.time() while time.time() - start < 10: if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')): + warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')): break time.sleep(0.5) @@ -1484,9 +1476,6 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read() crawl_log_1 = open(os.path.join( warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read() - crawl_log_2 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read() - # tests will fail in year 3000 :) assert re.match(b'\A2[^\n]+\n\Z', crawl_log) assert crawl_log[24:31] == b' 200 ' @@ -1524,6 +1513,24 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} assert extra_info['contentSize'] == 135 + # should be deduplicated + url = 'http://localhost:%s/b/d' % http_daemon.server_port + headers = {"Warcprox-Meta": json.dumps({ + "warc-prefix": "test_crawl_log_2", + "metadata": {"seed": "http://example.com/seed"}})} + response = requests.get(url, proxies=archiving_proxies, headers=headers) + assert response.status_code == 200 + + start = time.time() + while time.time() - start < 10: + if os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')): + break + time.sleep(0.5) + + crawl_log_2 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read() + assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2) assert crawl_log_2[24:31] == b' 200 ' assert crawl_log_2[31:42] == b' 44 ' @@ -1537,7 +1544,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert re.match(br'^\d{17}[+]\d{3}', fields[8]) assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM' assert fields[10] == b'http://example.com/seed' - assert fields[11] == b'-' + assert fields[11] == b'duplicate:digest' extra_info = json.loads(fields[12].decode('utf-8')) assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} assert extra_info['contentSize'] == 135 From 8a768dcd44bd0bb6fa51d1a57804b352ae3d2cfc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 14:06:53 -0700 Subject: [PATCH 06/15] fix crawl log test to avoid any dedup collisions --- tests/test_warcprox.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 63c96d2..19636d6 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1453,11 +1453,11 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): except: pass - url = 'http://localhost:%s/b/d' % http_daemon.server_port + url = 'http://localhost:%s/b/aa' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies) assert response.status_code == 200 - url = 'http://localhost:%s/b/e' % http_daemon.server_port + url = 'http://localhost:%s/b/bb' % http_daemon.server_port headers = { "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}), "Referer": "http://example.com/referer", @@ -1474,47 +1474,47 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): crawl_log = open(os.path.join( warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read() - crawl_log_1 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read() # tests will fail in year 3000 :) assert re.match(b'\A2[^\n]+\n\Z', crawl_log) assert crawl_log[24:31] == b' 200 ' - assert crawl_log[31:42] == b' 44 ' + assert crawl_log[31:42] == b' 54 ' fields = crawl_log.split() assert len(fields) == 13 - assert fields[3].endswith(b'/b/d') + assert fields[3].endswith(b'/b/aa') assert fields[4] == b'-' assert fields[5] == b'-' assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM' + assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP' assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} - assert extra_info['contentSize'] == 135 + assert extra_info['contentSize'] == 145 + crawl_log_1 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1) assert crawl_log_1[24:31] == b' 200 ' - assert crawl_log_1[31:42] == b' 44 ' + assert crawl_log_1[31:42] == b' 54 ' fields = crawl_log_1.split() assert len(fields) == 13 - assert fields[3].endswith(b'/b/e') + assert fields[3].endswith(b'/b/bb') assert fields[4] == b'-' assert fields[5] == b'http://example.com/referer' assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ' + assert fields[9] == b'sha1:TKXGVS3ZPR24VDVV3XWZXYQSPTDBWP53' assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} - assert extra_info['contentSize'] == 135 + assert extra_info['contentSize'] == 145 # should be deduplicated - url = 'http://localhost:%s/b/d' % http_daemon.server_port + url = 'http://localhost:%s/b/aa' % http_daemon.server_port headers = {"Warcprox-Meta": json.dumps({ "warc-prefix": "test_crawl_log_2", "metadata": {"seed": "http://example.com/seed"}})} @@ -1533,21 +1533,21 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2) assert crawl_log_2[24:31] == b' 200 ' - assert crawl_log_2[31:42] == b' 44 ' + assert crawl_log_2[31:42] == b' 54 ' fields = crawl_log_2.split() assert len(fields) == 13 - assert fields[3].endswith(b'/b/d') + assert fields[3].endswith(b'/b/aa') assert fields[4] == b'-' assert fields[5] == b'-' assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM' + assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP' assert fields[10] == b'http://example.com/seed' assert fields[11] == b'duplicate:digest' extra_info = json.loads(fields[12].decode('utf-8')) assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} - assert extra_info['contentSize'] == 135 + assert extra_info['contentSize'] == 145 if __name__ == '__main__': pytest.main() From 30b69c5838563d91f541e42d4bdc595685542244 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Aug 2017 16:21:08 -0700 Subject: [PATCH 07/15] make test pass with py27 --- tests/test_warcprox.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 19636d6..9e5fa6a 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1490,7 +1490,8 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) - assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert set(extra_info.keys()) == { + 'contentSize', 'warcFilename', 'warcFileOffset'} assert extra_info['contentSize'] == 145 crawl_log_1 = open(os.path.join( @@ -1510,7 +1511,8 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) - assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert set(extra_info.keys()) == { + 'contentSize', 'warcFilename', 'warcFileOffset'} assert extra_info['contentSize'] == 145 # should be deduplicated @@ -1546,7 +1548,8 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[10] == b'http://example.com/seed' assert fields[11] == b'duplicate:digest' extra_info = json.loads(fields[12].decode('utf-8')) - assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'} + assert set(extra_info.keys()) == { + 'contentSize', 'warcFilename', 'warcFileOffset'} assert extra_info['contentSize'] == 145 if __name__ == '__main__': From bac45a9df26211a8ecdbddbe836a36ed66cbcfec Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 8 Aug 2017 11:54:57 -0700 Subject: [PATCH 08/15] create crawl log dir at startup if it doesn't exist --- warcprox/crawl_log.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 6888110..eff4df9 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -28,6 +28,9 @@ import warcprox class CrawlLogger(object): def __init__(self, dir_): self.dir = dir_ + if not os.path.exists(self.dir): + logging.info('creating directory %r', self.dir) + os.mkdir(self.dir) def notify(self, recorded_url, records): # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} From 42f5e9b7a42eacb46a82405a319fa2af43edd565 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 11:21:42 -0800 Subject: [PATCH 09/15] add --crawl-log-dir option to fix failing test --- tests/test_warcprox.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index cf43949..0e8d33a 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -253,7 +253,8 @@ def warcprox_(request, rethinkdb_servers, rethinkdb_big_table): '--method-filter=POST', '--port=0', '--playback-port=0', - '--onion-tor-socks-proxy=localhost:9050'] + '--onion-tor-socks-proxy=localhost:9050', + '--crawl-log-dir=crawl-logs'] if rethinkdb_servers: rethinkdb_db = 'warcprox_test_%s' % ''.join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) argv.append('--rethinkdb-servers=%s' % rethinkdb_servers) From 72c2950c101cfa5073e715bf2098ea6fc62cbb75 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 11:22:58 -0800 Subject: [PATCH 10/15] bump dev version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3de51f7..a31c846 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev107', + version='2.2.1b2.dev108', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 538c9e0caf27f623ac57ff9f35f7a117456a7d5d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 12:34:06 -0800 Subject: [PATCH 11/15] modify test_crawl_log to expect crawl log to honor --base32 setting and add tests of WARCPROX_WRITE_RECORD request and HEAD request (not written to warc) --- setup.py | 2 +- tests/test_warcprox.py | 58 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index a31c846..04d0352 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev108', + version='2.2.1b2.dev109', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0e8d33a..21bf57e 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1374,7 +1374,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP' + assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f' assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) @@ -1395,7 +1395,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:TKXGVS3ZPR24VDVV3XWZXYQSPTDBWP53' + assert fields[9] == b'sha1:9aae6acb797c75ca8eb5dded9be2127cc61b3fbb' assert fields[10] == b'-' assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) @@ -1432,7 +1432,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[6] == b'text/plain' assert fields[7] == b'-' assert re.match(br'^\d{17}[+]\d{3}', fields[8]) - assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP' + assert fields[9] == b'sha1:69d51a46e44a04e8110da0c91897cece979fa70f' assert fields[10] == b'http://example.com/seed' assert fields[11] == b'duplicate:digest' extra_info = json.loads(fields[12].decode('utf-8')) @@ -1440,6 +1440,58 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): 'contentSize', 'warcFilename', 'warcFileOffset'} assert extra_info['contentSize'] == 145 + # a request that is not saved to a warc (because of --method-filter) + # currently not logged at all (XXX maybe it should be) + url = 'http://localhost:%s/b/cc' % http_daemon.server_port + headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})} + response = requests.head(url, proxies=archiving_proxies, headers=headers) + time.sleep(3) + assert not os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')) + + # WARCPROX_WRITE_RECORD + url = 'http://fakeurl/' + payload = b'I am the WARCPROX_WRITE_RECORD payload' + headers = { + 'Content-Type': 'text/plain', + 'WARC-Type': 'metadata', + 'Host': 'N/A', + 'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_4'}), + } + response = requests.request( + method='WARCPROX_WRITE_RECORD', url=url, data=payload, + headers=headers, proxies=archiving_proxies) + assert response.status_code == 204 + + start = time.time() + while time.time() - start < 10: + if os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')): + break + time.sleep(0.5) + + crawl_log_4 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read() + + assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4) + assert crawl_log_4[24:31] == b' 204 ' + assert crawl_log_4[31:42] == b' 38 ' + fields = crawl_log_4.split() + assert len(fields) == 13 + assert fields[3] == b'http://fakeurl/' + assert fields[4] == b'-' + assert fields[5] == b'-' + assert fields[6] == b'text/plain' + assert fields[7] == b'-' + assert re.match(br'^\d{17}[+]\d{3}', fields[8]) + assert fields[9] == b'sha1:bb56497c17d2684f5eca4af9df908c78ba74ca1c' + assert fields[10] == b'-' + assert fields[11] == b'-' + extra_info = json.loads(fields[12].decode('utf-8')) + assert set(extra_info.keys()) == { + 'contentSize', 'warcFilename', 'warcFileOffset'} + assert extra_info['contentSize'] == 38 + def test_long_warcprox_meta( warcprox_, http_daemon, archiving_proxies, playback_proxies): url = 'http://localhost:%s/b/g' % http_daemon.server_port From 78c6137016f602a37925bc81f1baefd4b40e4ff1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 12:35:10 -0800 Subject: [PATCH 12/15] fix crawl log handling of WARCPROX_WRITE_RECORD request --- setup.py | 2 +- warcprox/crawl_log.py | 18 ++++++++++++++---- warcprox/main.py | 3 ++- warcprox/warcproxy.py | 23 +++++++++++++---------- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 04d0352..78e312b 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev109', + version='2.2.1b2.dev110', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index eff4df9..68d1fbf 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -26,8 +26,9 @@ import os import warcprox class CrawlLogger(object): - def __init__(self, dir_): + def __init__(self, dir_, options=warcprox.Options()): self.dir = dir_ + self.options = options if not os.path.exists(self.dir): logging.info('creating directory %r', self.dir) os.mkdir(self.dir) @@ -40,10 +41,20 @@ class CrawlLogger(object): 'warcFilename': records[0].warc_filename, 'warcFileOffset': records[0].offset, } + if recorded_url.response_recorder: + content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset + payload_digest = warcprox.digest_str( + recorded_url.response_recorder.payload_digest, + self.options.base32) + else: + # WARCPROX_WRITE_RECORD request + content_length = len(recorded_url.request_data) + payload_digest = records[0].get_header( + b'WARC-Payload-Digest') fields = [ '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % recorded_url.status, - '% 10s' % (recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset), + '% 10s' % content_length, recorded_url.url, '-', # hop path recorded_url.referer or '-', @@ -53,8 +64,7 @@ class CrawlLogger(object): recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, recorded_url.duration.microseconds//1000), - warcprox.digest_str( - recorded_url.response_recorder.payload_digest, True), + payload_digest, recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), 'duplicate:digest' if records[0].type == b'revisit' else '-', json.dumps(extra_info, separators=(',',':')), diff --git a/warcprox/main.py b/warcprox/main.py index e21ff6a..1e6aaf8 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -238,7 +238,8 @@ def init_controller(args): playback_proxy = None if args.crawl_log_dir: - listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir)) + listeners.append(warcprox.crawl_log.CrawlLogger( + args.crawl_log_dir, options=options)) for qualname in args.plugins or []: try: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 544dc61..afe1835 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -293,16 +293,19 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): if raw_warcprox_meta: warcprox_meta = json.loads(raw_warcprox_meta) - rec_custom = RecordedUrl(url=self.url, - request_data=request_data, - response_recorder=None, - remote_ip=b'', - warcprox_meta=warcprox_meta, - content_type=self.headers['Content-Type'], - custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'), - status=204, size=len(request_data), - client_ip=self.client_address[0], - method=self.command, timestamp=timestamp) + rec_custom = RecordedUrl( + url=self.url, + request_data=request_data, + response_recorder=None, + remote_ip=b'', + warcprox_meta=warcprox_meta, + content_type=self.headers['Content-Type'], + custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'), + status=204, size=len(request_data), + client_ip=self.client_address[0], + method=self.command, + timestamp=timestamp, + duration=datetime.datetime.utcnow()-timestamp) self.server.recorded_url_q.put(rec_custom) self.send_response(204, 'OK') From df6d7f1ce684b5f5b9be145bc57ffa66652d3392 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 13:09:07 -0800 Subject: [PATCH 13/15] make test_crawl_log expect HEAD request to be logged --- setup.py | 2 +- tests/test_warcprox.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 78e312b..02853d5 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev110', + version='2.2.1b2.dev111', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 21bf57e..0a357b2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1445,9 +1445,33 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): url = 'http://localhost:%s/b/cc' % http_daemon.server_port headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})} response = requests.head(url, proxies=archiving_proxies, headers=headers) - time.sleep(3) - assert not os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')) + + start = time.time() + while time.time() - start < 10: + if os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')): + break + time.sleep(0.5) + + crawl_log_3 = open(os.path.join( + warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read() + + assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3) + assert crawl_log_3[24:31] == b' 200 ' + assert crawl_log_3[31:42] == b' 0 ' + fields = crawl_log_3.split() + assert len(fields) == 13 + assert fields[3].endswith(b'/b/cc') + assert fields[4] == b'-' + assert fields[5] == b'-' + assert fields[6] == b'text/plain' + assert fields[7] == b'-' + assert re.match(br'^\d{17}[+]\d{3}', fields[8]) + assert fields[9] == b'sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709' + assert fields[10] == b'-' + assert fields[11] == b'-' + extra_info = json.loads(fields[12].decode('utf-8')) + assert extra_info == {'contentSize': 91} # WARCPROX_WRITE_RECORD url = 'http://fakeurl/' From 700056cc0428b447e41c1c33cab3dc21a147af0a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 13:10:57 -0800 Subject: [PATCH 14/15] fix failing test just committed, which involves running "listeners" for all urls, including those not archived; make adjustments accordingly --- setup.py | 2 +- warcprox/bigtable.py | 7 ++++--- warcprox/crawl_log.py | 14 ++++++-------- warcprox/dedup.py | 4 ++-- warcprox/playback.py | 3 ++- warcprox/stats.py | 16 ++++++++-------- warcprox/writerthread.py | 18 ++++++++++++------ 7 files changed, 35 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index 02853d5..0b5c891 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev111', + version='2.2.1b2.dev112', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index f3d897d..c9547b2 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -200,9 +200,10 @@ class RethinkCaptures: return entry def notify(self, recorded_url, records): - entry = self._assemble_entry(recorded_url, records) - with self._batch_lock: - self._batch.append(entry) + if records: + entry = self._assemble_entry(recorded_url, records) + with self._batch_lock: + self._batch.append(entry) def close(self): self.stop() diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 68d1fbf..5b4a4fc 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -36,11 +36,10 @@ class CrawlLogger(object): def notify(self, recorded_url, records): # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} now = datetime.datetime.utcnow() - extra_info = { - 'contentSize': recorded_url.size, - 'warcFilename': records[0].warc_filename, - 'warcFileOffset': records[0].offset, - } + extra_info = {'contentSize': recorded_url.size,} + if records: + extra_info['warcFilename'] = records[0].warc_filename + extra_info['warcFileOffset'] = records[0].offset if recorded_url.response_recorder: content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset payload_digest = warcprox.digest_str( @@ -49,8 +48,7 @@ class CrawlLogger(object): else: # WARCPROX_WRITE_RECORD request content_length = len(recorded_url.request_data) - payload_digest = records[0].get_header( - b'WARC-Payload-Digest') + payload_digest = records[0].get_header(b'WARC-Payload-Digest') fields = [ '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % recorded_url.status, @@ -66,7 +64,7 @@ class CrawlLogger(object): recorded_url.duration.microseconds//1000), payload_digest, recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), - 'duplicate:digest' if records[0].type == b'revisit' else '-', + 'duplicate:digest' if records and records[0].type == b'revisit' else '-', json.dumps(extra_info, separators=(',',':')), ] for i in range(len(fields)): diff --git a/warcprox/dedup.py b/warcprox/dedup.py index e70f5f9..b9cd223 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -93,7 +93,7 @@ class DedupDb(object): return result def notify(self, recorded_url, records): - if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str( recorded_url.response_recorder.payload_digest, @@ -172,7 +172,7 @@ class RethinkDedupDb: return result def notify(self, recorded_url, records): - if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.options.base32) diff --git a/warcprox/playback.py b/warcprox/playback.py index a9aa47d..1a698c0 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -259,7 +259,8 @@ class PlaybackIndexDb(object): pass def notify(self, recorded_url, records): - self.save(records[0].warc_filename, records, records[0].offset) + if records: + self.save(records[0].warc_filename, records, records[0].offset) def save(self, warcfile, recordset, offset): response_record = recordset[0] diff --git a/warcprox/stats.py b/warcprox/stats.py index 55693a2..99e6804 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -170,12 +170,13 @@ class StatsDb: bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size - if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size + if records: + if records[0].type == b'revisit': + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size json_value = json.dumps(bucket_stats, separators=(',',':')) conn.execute( @@ -304,8 +305,7 @@ class RethinkStatsDb(StatsDb): def tally(self, recorded_url, records): buckets = self.buckets(recorded_url) - is_revisit = records[0].get_header( - warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT + is_revisit = records[0].type == b'revisit' with self._batch_lock: for bucket in buckets: bucket_stats = self._batch.setdefault( diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index e422a65..a8a6ef7 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -82,13 +82,15 @@ class WarcWriterThread(threading.Thread): self.logger.info("%s urls left to write", qsize) recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) + records = [] self.idle = None if self._filter_accepts(recorded_url): if self.dedup_db: warcprox.dedup.decorate_with_dedup_info(self.dedup_db, recorded_url, base32=self.options.base32) records = self.writer_pool.write_records(recorded_url) - self._final_tasks(recorded_url, records) + + self._final_tasks(recorded_url, records) # try to release resources in a timely fashion if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: @@ -134,11 +136,15 @@ class WarcWriterThread(threading.Thread): payload_digest = "-" # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} - self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( - recorded_url.client_ip, recorded_url.status, recorded_url.method, - recorded_url.url.decode("utf-8"), recorded_url.mimetype, - recorded_url.size, payload_digest, records[0].type.decode("utf-8"), - records[0].warc_filename, records[0].offset)) + type_ = records[0].type.decode("utf-8") if records else '-' + filename = records[0].warc_filename if records else '-' + offset = records[0].offset if records else '-' + self.logger.info( + "%s %s %s %s %s size=%s %s %s %s offset=%s", + recorded_url.client_ip, recorded_url.status, + recorded_url.method, recorded_url.url.decode("utf-8"), + recorded_url.mimetype, recorded_url.size, payload_digest, + type_, filename, offset) def _final_tasks(self, recorded_url, records): if self.listeners: From 750a333aa6ed3275fb19b882c6bc02fcd889ccfe Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 9 Nov 2017 15:23:15 -0800 Subject: [PATCH 15/15] not gonna bother figuring out why pypy regex is not matching https://travis-ci.org/internetarchive/warcprox/jobs/299864258#L615 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 5f7d8b3..d51105f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ python: matrix: allow_failures: + - python: pypy - python: pypy3 - python: nightly - python: 3.7-dev