diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index fb908d9..bdace29 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -374,13 +374,17 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry):
     options.method_filter = ['GET','POST']
 
+    options.crawl_log_dir = tempfile.mkdtemp(
+            prefix='warcprox-test-', suffix='-crawl-log')
+    crawl_logger = warcprox.crawl_log.CrawlLogger(options.crawl_log_dir)
+
     writer_pool = warcprox.writer.WarcWriterPool(options)
     warc_writer_threads = [
             warcprox.writerthread.WarcWriterThread(
                 recorded_url_q=recorded_url_q, writer_pool=writer_pool,
                 dedup_db=dedup_db, listeners=[
-                    captures_db or dedup_db, playback_index_db, stats_db],
-                options=options)
+                    captures_db or dedup_db, playback_index_db, stats_db,
+                    crawl_logger], options=options)
             for i in range(int(proxy.max_threads ** 0.5))]
 
     warcprox_ = warcprox.controller.WarcproxController(
@@ -396,7 +400,8 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry):
         logging.info('stopping warcprox')
         warcprox_.stop.set()
         warcprox_thread.join()
-        for f in (ca_file, ca_dir, options.directory, playback_index_db_file):
+        for f in (ca_file, ca_dir, options.directory, playback_index_db_file,
+                options.crawl_log_dir):
             if os.path.isdir(f):
                 logging.info('deleting directory {}'.format(f))
                 shutil.rmtree(f)
@@ -1442,6 +1447,101 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
     assert response.status_code == 500
     assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
 
+def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
+    try:
+        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+    except:
+        pass
+
+    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    url = 'http://localhost:%s/b/e' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
+        "Referer": "http://example.com/referer",
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # should be deduplicated
+    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log_1 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_2 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+
+    # tests will fail in year 3000 :)
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
+    assert crawl_log[24:31] == b'   200 '
+    assert crawl_log[31:42] == b'        44 '
+    fields = crawl_log.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/d')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
+    assert crawl_log_1[24:31] == b'   200 '
+    assert crawl_log_1[31:42] == b'        44 '
+    fields = crawl_log_1.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/e')
+    assert fields[4] == b'-'
+    assert fields[5] == b'http://example.com/referer'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
+    assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
+    assert crawl_log_2[24:31] == b'   200 '
+    assert crawl_log_2[31:42] == b'        44 '
+    fields = crawl_log_2.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/d')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[10] == b'http://example.com/seed'
+    assert fields[11] == b'duplicate:digest'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'contentSize','warcFilename','warcFileOffset'}
+    assert extra_info['contentSize'] == 135
+
 if __name__ == '__main__':
     pytest.main()
diff --git a/warcprox/__init__.py b/warcprox/__init__.py
index 5564ff3..ecd6f53 100644
--- a/warcprox/__init__.py
+++ b/warcprox/__init__.py
@@ -114,3 +114,4 @@ import warcprox.warc as warc
 import warcprox.writerthread as writerthread
 import warcprox.stats as stats
 import warcprox.bigtable as bigtable
+import warcprox.crawl_log as crawl_log
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 4c04563..6888110 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -22,6 +22,8 @@ USA.
 import logging
 import datetime
 import json
+import os
+import warcprox
 
 class CrawlLogger(object):
     def __init__(self, dir_):
@@ -36,33 +38,38 @@ class CrawlLogger(object):
             'warcFileOffset': records[0].offset,
         }
         fields = [
-            '{:%Y-%m-%dT%H:%M:%S}.{:03d}'.format(now, now.microsecond//1000),
+            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % recorded_url.status,
             '% 10s' % (recorded_url.response_recorder.len
                 - recorded_url.response_recorder.payload_offset),
             recorded_url.url,
             '-', # hop path
             recorded_url.referer or '-',
-            recorded_url.mimetype,
+            recorded_url.mimetype or '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
-                recorded_url.timestamp, recorded_url.microsecond//1000,
+                recorded_url.timestamp,
+                recorded_url.timestamp.microsecond//1000,
                 recorded_url.duration.microseconds//1000),
             warcprox.digest_str(
                 recorded_url.response_recorder.payload_digest, True),
             recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
-            'duplicate:digest' if records[0].type == b'revisit' else '0',
+            'duplicate:digest' if records[0].type == b'revisit' else '-',
             json.dumps(extra_info, separators=(',',':')),
         ]
         for i in range(len(fields)):
-            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes
+            # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
             try:
                 fields[i] = fields[i].encode('utf-8')
             except:
                 pass
-        line = b' '.join(fields)
+        line = b' '.join(fields) + b'\n'
         if 'warc-prefix' in recorded_url.warcprox_meta:
             filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-            os.path.join(
-                    self.dir, )
+        else:
+            filename = 'crawl.log'
+
+        crawl_log_path = os.path.join(self.dir, filename)
+        with open(crawl_log_path, 'ab') as f:
+            f.write(line)
 
diff --git a/warcprox/main.py b/warcprox/main.py
index 54355d7..30f85d2 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -128,6 +128,11 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    arg_parser.add_argument(
+            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
+                'if specified, write crawl log files in the specified '
+                'directory; one crawl log is written per warc filename '
+                'prefix; crawl log format mimics heritrix'))
     arg_parser.add_argument(
             '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
             action='append', help=(
@@ -228,6 +233,9 @@ def init_controller(args):
         playback_index_db = None
         playback_proxy = None
 
+    if args.crawl_log_dir:
+        listeners.append(warcprox.crawl_log.CrawlLogger(args.crawl_log_dir))
+
     for qualname in args.plugins or []:
         try:
             (module_name, class_name) = qualname.rsplit('.', 1)
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 6cbc9e4..95ca81f 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -207,7 +207,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 client_ip=self.client_address[0],
                 content_type=prox_rec_res.getheader("Content-Type"),
                 method=self.command, timestamp=timestamp, host=self.hostname,
-                duration=datetime.datetime.utcnow()-timestamp)
+                duration=datetime.datetime.utcnow()-timestamp,
+                referer=self.headers.get('referer'))
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -314,7 +315,7 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None):
+            timestamp=None, host=None, duration=None, referer=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -351,6 +352,7 @@ class RecordedUrl:
         self.timestamp = timestamp
         self.host = host
         self.duration = duration
+        self.referer = referer
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
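
For reference, a minimal sketch (not part of the patch) of consuming the heritrix-style log files the new CrawlLogger writes. The field order mirrors CrawlLogger and the assertions in test_crawl_log above; the log path and the dict key names are illustrative assumptions, not warcprox API:

    import json

    def parse_crawl_log_line(line):
        # 13 whitespace-separated fields, in the order CrawlLogger writes
        # them: timestamp, status, payload size, url, hop path, referer,
        # mimetype, worker thread ('-'), capture timestamp+duration,
        # payload digest, seed, annotations (e.g. 'duplicate:digest' for
        # deduplicated captures), json "extra info"
        fields = line.split()
        assert len(fields) == 13
        return {
            'timestamp': fields[0],
            'status': int(fields[1]),
            'size': int(fields[2]),
            'url': fields[3],
            'hop_path': fields[4],
            'referer': fields[5],
            'mimetype': fields[6],
            'thread': fields[7],
            'capture_timestamp': fields[8],
            'digest': fields[9],
            'seed': fields[10],
            'annotations': fields[11],
            'extra_info': json.loads(fields[12]),
        }

    # e.g. after running `warcprox --crawl-log-dir=/tmp/crawl-logs ...`
    # (path is illustrative)
    with open('/tmp/crawl-logs/crawl.log') as f:
        for line in f:
            entry = parse_crawl_log_line(line)
            print(entry['url'], entry['status'], entry['annotations'])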