From 3c215b42b56f45537973e91cc4a66050364a31b8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 14:34:06 -0800 Subject: [PATCH 1/4] missed a spot handling case of no warc records written --- setup.py | 2 +- warcprox/stats.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 0b5c891..56c39f5 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev112', + version='2.2.1b2.dev113', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/stats.py b/warcprox/stats.py index 99e6804..254f764 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -305,7 +305,6 @@ class RethinkStatsDb(StatsDb): def tally(self, recorded_url, records): buckets = self.buckets(recorded_url) - is_revisit = records[0].type == b'revisit' with self._batch_lock: for bucket in buckets: bucket_stats = self._batch.setdefault( @@ -314,12 +313,13 @@ class RethinkStatsDb(StatsDb): bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size - if is_revisit: - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size + if records: + if records[0].type == b'revisit': + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size def _add_to_batch(self, add_me): with self._batch_lock: From 30b6b0b337782c1f5ce918355f9009770bbae6a2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 17:02:33 -0800 Subject: [PATCH 2/4] new failing test for correct calculation of payload digest which should match rfc2616 entity body, which is transfer decoded but not content-decoded --- tests/test_warcprox.py | 149 ++++++++++++++++++++++++++++++++++++++++- warcprox/__init__.py | 2 +- warcprox/mitmproxy.py | 3 +- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a357b2..4d1caab 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -46,6 +46,10 @@ from collections import Counter import socket import datetime import warcio.archiveiterator +import io +import gzip +import mock +import email.message try: import http.server as http_server @@ -84,7 +88,7 @@ def _send(self, data): # http_client.HTTPConnection.send = _send logging.basicConfig( - stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE, + stream=sys.stdout, level=warcprox.TRACE, format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) @@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None): signal.signal(signal.SIGQUIT, dump_state) +def chunkify(buf, chunk_size=13): + i = 0 + result = b'' + while i < len(buf): + chunk_len = min(len(buf) - i, chunk_size) + result += ('%x\r\n' % chunk_len).encode('ascii') + result += buf[i:i+chunk_len] + result += b'\r\n' + i += chunk_size + result += b'0\r\n\r\n' + return result + +# def gzipify(buf): +# with io.BytesIO() as outbuf: +# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz: +# gz.write(buf) +# return outbuf.getvalue() + class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): 
def build_response(self): m = re.match(r'^/([^/]+)/([^/]+)$', self.path) @@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): + b'Content-Type: text/plain\r\n' + b'\r\n') payload = b'This response is missing a Content-Length http header.' + elif self.path.startswith('/test_payload_digest-'): + content_body = ( + b'Hello. How are you. I am the test_payload_digest ' + b'content body. The entity body is a possibly content-' + b'encoded version of me. The message body is a possibly ' + b'transfer-encoded version of the entity body.\n') + gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^" + b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8" + b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f" + b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL" + b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc" + b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab" + b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0" + b"\xc8\x00\x00\x00") + double_gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b" + b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f" + b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2" + b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?" + b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}" + b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac" + b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5" + b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00" + b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00") + if self.path == '/test_payload_digest-plain': + payload = content_body + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip': + payload = gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-ce-gzip': + payload = gzipped + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip': + payload = double_gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-te-chunked': + payload = chunkify(content_body) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-ce-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked': + payload = chunkify(double_gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + else: + raise Exception('bad path') + headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n' + logging.info('headers=%r payload=%r', headers, payload) else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -1554,6 +1641,66 @@ def 
test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_payload_digest(warcprox_, http_daemon): + ''' + Tests that digest is of RFC2616 "entity body" + (transfer-decoded but not content-decoded) + ''' + class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler): + def __init__(self, url): + self.path = url + self.request_version = 'HTTP/1.1' + self.client_address = mock.MagicMock() + self.headers = email.message.Message() + self.headers.add_header('Host', 'localhost:%s' % http_daemon.server_port) + self.server = warcprox_.proxy + self.command = 'GET' + self.connection = mock.Mock() + + PLAIN_SHA1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7' + GZIP_SHA1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129' + GZIP_GZIP_SHA1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4' + + # plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-plain' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + + # chunked plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # chunked content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/__init__.py b/warcprox/__init__.py index ecd6f53..e50a415 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -28,7 +28,7 @@ except ImportError: import Queue as queue import datetime -def digest_str(hash_obj, base32): +def digest_str(hash_obj, base32=False): import base64 return hash_obj.name.encode('utf-8') + b':' + ( base64.b32encode(hash_obj.digest()) if base32 diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b14cddf..722311b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return try: - self._proxy_request() + 
return self._proxy_request() except: self.logger.error("exception proxying request", exc_info=True) raise @@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if 'Content-Length' in self.headers: req += self.rfile.read(int(self.headers['Content-Length'])) + prox_rec_res = None try: self.logger.debug('sending to remote server req=%r', req) From 3a0f6e0947c859ebd45032ea7421d3d3449cbfd9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 17:18:22 -0800 Subject: [PATCH 3/4] fix payload digest by pulling calculation up one level where content has already been transfer-decoded --- setup.py | 2 +- tests/single-threaded-proxy.py | 4 ++-- warcprox/bigtable.py | 8 ++++---- warcprox/crawl_log.py | 2 +- warcprox/dedup.py | 11 +++++------ warcprox/mitmproxy.py | 25 +++++++++++++------------ warcprox/warc.py | 13 +++++-------- warcprox/warcproxy.py | 7 +++++-- 8 files changed, 36 insertions(+), 36 deletions(-) diff --git a/setup.py b/setup.py index 56c39f5..625abbb 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev113', + version='2.2.1b2.dev114', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index dd5e709..1c176a2 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -3,7 +3,7 @@ tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for debugging, does not write warcs -Copyright (C) 2015-2016 Internet Archive +Copyright (C) 2015-2017 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -46,7 +46,7 @@ class FakeQueue(object): logging.info("{} {} {} {} {} size={} {}".format( recorded_url.client_ip, recorded_url.status, recorded_url.method, recorded_url.url.decode("utf-8"), recorded_url.mimetype, - recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8'))) + recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8'))) def parse_args(): prog = os.path.basename(sys.argv[0]) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index c9547b2..115aed9 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -140,16 +140,16 @@ class RethinkCaptures: return result def _assemble_entry(self, recorded_url, records): - if recorded_url.response_recorder: - if recorded_url.response_recorder.payload_digest.name == "sha1": + if recorded_url.payload_digest: + if recorded_url.payload_digest.name == "sha1": sha1base32 = base64.b32encode( - recorded_url.response_recorder.payload_digest.digest() + recorded_url.payload_digest.digest() ).decode("utf-8") else: self.logger.warn( "digest type is %r but big captures table is indexed " "by sha1", - recorded_url.response_recorder.payload_digest.name) + recorded_url.payload_digest.name) else: digest = hashlib.new("sha1", records[0].content[1]) sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 5b4a4fc..f28683a 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -43,7 +43,7 @@ class CrawlLogger(object): if recorded_url.response_recorder: content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset payload_digest = warcprox.digest_str( - 
recorded_url.response_recorder.payload_digest, + recorded_url.payload_digest, self.options.base32) else: # WARCPROX_WRITE_RECORD request diff --git a/warcprox/dedup.py b/warcprox/dedup.py index b9cd223..be49104 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -96,8 +96,7 @@ class DedupDb(object): if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str( - recorded_url.response_recorder.payload_digest, - self.options.base32) + recorded_url.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save( digest_key, records[0], @@ -108,9 +107,9 @@ class DedupDb(object): def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): if (recorded_url.response_recorder - and recorded_url.response_recorder.payload_digest + and recorded_url.payload_digest and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) + digest_key = warcprox.digest_str(recorded_url.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], recorded_url.url) @@ -174,8 +173,8 @@ class RethinkDedupDb: def notify(self, recorded_url, records): if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, - self.options.base32) + digest_key = warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) else: diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 722311b..2c34bcd 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -66,7 +66,7 @@ import time class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, - calculating digests, and sending them on to the proxy client. + calculating the block digest, and sending them on to the proxy client. 
""" logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") @@ -78,27 +78,19 @@ class ProxyingRecorder(object): self.digest_algorithm = digest_algorithm self.block_digest = hashlib.new(digest_algorithm) self.payload_offset = None - self.payload_digest = None self.proxy_client = proxy_client self._proxy_client_conn_open = True self.len = 0 self.url = url def payload_starts_now(self): - self.payload_digest = hashlib.new(self.digest_algorithm) self.payload_offset = self.len - def _update_payload_digest(self, hunk): - if self.payload_digest: - self.payload_digest.update(hunk) - def _update(self, hunk): - self._update_payload_digest(hunk) self.block_digest.update(hunk) - self.tempfile.write(hunk) - if self.payload_digest and self._proxy_client_conn_open: + if self.payload_offset is not None and self._proxy_client_conn_open: try: self.proxy_client.sendall(hunk) except BaseException as e: @@ -157,6 +149,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self, sock, debuglevel=debuglevel, method=method) self.proxy_client = proxy_client self.url = url + self.digest_algorithm = digest_algorithm # Keep around extra reference to self.fp because HTTPResponse sets # self.fp=None after it finishes reading, but we still need it @@ -164,6 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.fp, proxy_client, digest_algorithm, url=url) self.fp = self.recorder + self.payload_digest = None + def begin(self, extra_response_headers={}): http_client.HTTPResponse.begin(self) # reads status line, headers @@ -185,6 +180,12 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.proxy_client.sendall(status_and_headers.encode('latin1')) self.recorder.payload_starts_now() + self.payload_digest = hashlib.new(self.digest_algorithm) + + def read(self, amt=None): + buf = http_client.HTTPResponse.read(self, amt) + self.payload_digest.update(buf) + return buf def via_header_value(orig, request_version): via = orig @@ -419,9 +420,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): url=self.url, method=self.command) prox_rec_res.begin(extra_response_headers=extra_response_headers) - buf = prox_rec_res.read(8192) + buf = prox_rec_res.read(65536) while buf != b'': - buf = prox_rec_res.read(8192) + buf = prox_rec_res.read(65536) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) except Exception as e: diff --git a/warcprox/warc.py b/warcprox/warc.py index de0ec06..6b9cbcf 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -53,7 +53,8 @@ class WarcRecordBuilder: refers_to=recorded_url.dedup_info.get('id'), refers_to_target_uri=recorded_url.dedup_info['url'], refers_to_date=recorded_url.dedup_info['date'], - payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32), + payload_digest=warcprox.digest_str( + recorded_url.payload_digest, self.base32), profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, remote_ip=recorded_url.remote_ip) @@ -64,7 +65,9 @@ class WarcRecordBuilder: recorder=recorded_url.response_recorder, warc_type=warctools.WarcRecord.RESPONSE, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) + remote_ip=recorded_url.remote_ip, + payload_digest=warcprox.digest_str( + recorded_url.payload_digest, self.base32)) def build_warc_records(self, recorded_url): """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" @@ -122,13 +125,8 @@ class 
WarcRecordBuilder: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) headers.append((warctools.WarcRecord.BLOCK_DIGEST, warcprox.digest_str(recorder.block_digest, self.base32))) - if recorder.payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - warcprox.digest_str(recorder.payload_digest, self.base32))) - recorder.tempfile.seek(0) record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) - else: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) digest = hashlib.new(self.digest_algorithm, data) @@ -137,7 +135,6 @@ class WarcRecordBuilder: if not payload_digest: headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, warcprox.digest_str(digest, self.base32))) - content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index afe1835..f1de01e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -218,7 +218,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): content_type=content_type, method=self.command, timestamp=timestamp, host=self.hostname, duration=datetime.datetime.utcnow()-timestamp, - referer=self.headers.get('referer')) + referer=self.headers.get('referer'), + payload_digest=prox_rec_res.payload_digest) self.server.recorded_url_q.put(recorded_url) return recorded_url @@ -328,7 +329,8 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, - timestamp=None, host=None, duration=None, referer=None): + timestamp=None, host=None, duration=None, referer=None, + payload_digest=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) 
if type(url) is not bytes: @@ -366,6 +368,7 @@ class RecordedUrl: self.host = host self.duration = duration self.referer = referer + self.payload_digest = payload_digest # inherit from object so that multiple inheritance from this class works # properly in python 2 From ffc8a268ab517e76ccb4e543305195a9ca552932 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 13 Nov 2017 11:45:06 -0800 Subject: [PATCH 4/4] hopefully fix test failing occasionally apparently due to race condition by checking that the file we're waiting for has some content --- setup.py | 2 +- tests/test_warcprox.py | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 625abbb..ebeb213 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev114', + version='2.2.1b2.dev115', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 4d1caab..97e4351 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1441,11 +1441,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 200 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log') while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) + assert os.path.exists(file) + assert os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'crawl.log')) crawl_log = open(os.path.join( warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read() @@ -1499,14 +1502,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 200 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log') while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) + assert os.path.exists(file) - crawl_log_2 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read() + crawl_log_2 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2) assert crawl_log_2[24:31] == b' 200 ' @@ -1533,16 +1536,15 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})} response = requests.head(url, proxies=archiving_proxies, headers=headers) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log') start = time.time() while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) - crawl_log_3 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read() - + assert os.path.exists(file) + crawl_log_3 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3) assert crawl_log_3[24:31] == b' 200 ' assert crawl_log_3[31:42] == b' 0 ' @@ -1575,14 +1577,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 204 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log') while 
time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) - crawl_log_4 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read() + assert os.path.exists(file) + crawl_log_4 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4) assert crawl_log_4[24:31] == b' 204 '
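
To make the intent of patches 2 and 3 concrete, here is a minimal standalone sketch (not warcprox code; dechunk() is a hypothetical helper, and chunkify() mirrors the test helper of the same name above). The payload digest must cover the RFC 2616 "entity body": the message body after transfer decoding (de-chunking) but before content decoding (gunzipping). Digesting the raw wire bytes, as the old ProxyingRecorder did, yields a different digest for a chunked response than for the byte-identical unchunked one; digesting after transfer decoding, as the new ProxyingRecordingHTTPResponse.read() override in patch 3 does, yields the same digest either way.

    import hashlib

    def chunkify(buf, chunk_size=13):
        # wrap buf in Transfer-Encoding: chunked framing, like the
        # chunkify() test helper in tests/test_warcprox.py above
        result = b''
        for i in range(0, len(buf), chunk_size):
            chunk = buf[i:i+chunk_size]
            result += ('%x\r\n' % len(chunk)).encode('ascii') + chunk + b'\r\n'
        return result + b'0\r\n\r\n'

    def dechunk(buf):
        # hypothetical minimal chunked-transfer decoder (assumes
        # well-formed input, no trailers); stands in for the transfer
        # decoding http.client performs before the read() override
        # updates payload_digest
        result = b''
        while True:
            size_line, _, buf = buf.partition(b'\r\n')
            size = int(size_line, 16)
            if size == 0:
                return result
            result += buf[:size]
            buf = buf[size+2:]  # skip chunk data plus trailing \r\n

    entity_body = b'I am a possibly content-encoded entity body.\n'
    message_body = chunkify(entity_body)

    # digesting raw wire bytes gives a different answer for the chunked
    # transfer of the very same entity body...
    assert hashlib.sha1(message_body).digest() != hashlib.sha1(entity_body).digest()
    # ...while digesting after transfer decoding matches the unchunked case
    assert hashlib.sha1(dechunk(message_body)).digest() == hashlib.sha1(entity_body).digest()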
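
Patch 4 distills to a reusable polling pattern, sketched here with a hypothetical helper name: os.path.exists() alone races against the writer, which may have created the crawl log file before flushing the log line, so the updated tests also require a nonzero size before reading.

    import os
    import time

    def wait_for_nonempty(path, timeout=10, poll=0.5):
        # hypothetical helper; same logic the updated test_crawl_log()
        # inlines: wait until the file exists AND has some content
        start = time.time()
        while time.time() - start < timeout:
            if os.path.exists(path) and os.stat(path).st_size > 0:
                return True
            time.sleep(poll)
        return False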