diff --git a/setup.py b/setup.py index 85aa827..f630e37 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ except: deps.append('futures') setuptools.setup(name='warcprox', - version='1.5.0.dev2', + version='1.5.0.dev3', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0d724c7..5dcd4df 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -22,6 +22,7 @@ import warnings import pprint import traceback import signal +from collections import Counter try: import http.server as http_server @@ -662,6 +663,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'response' assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -670,6 +673,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'response' assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -678,6 +683,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'revisit' assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' @@ -686,6 +693,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, (offset, record, errors) = next(record_iter) assert record.type == b'revisit' assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' (offset, record, errors) = next(record_iter) assert record.type == b'request' diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 231d54c..9bb3d6d 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -14,7 +14,7 @@ import rethinkstuff class RethinkCaptures: """Inserts in batches every 0.5 seconds""" - logger = logging.getLogger("warcprox.bigtables.RethinkCaptures") + logger = logging.getLogger("warcprox.bigtable.RethinkCaptures") def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()): self.r = r diff --git a/warcprox/warc.py b/warcprox/warc.py index eed045c..5cba38d 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -114,8 +114,9 @@ class WarcRecordBuilder: digest = hashlib.new(self.digest_algorithm, data) headers.append((warctools.WarcRecord.BLOCK_DIGEST, warcprox.digest_str(digest, self.base32))) - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - warcprox.digest_str(digest, self.base32))) + if not payload_digest: + headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, + warcprox.digest_str(digest, self.base32))) content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple)