fix bug where two warc-payload-digest headers were written to revisit records

This commit is contained in:
Noah Levitt 2016-03-15 06:27:21 +00:00
parent 2c91eb03d3
commit 42a81d8f8f
4 changed files with 14 additions and 4 deletions

View File

@ -30,7 +30,7 @@ except:
deps.append('futures')
setuptools.setup(name='warcprox',
version='1.5.0.dev2',
version='1.5.0.dev3',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -22,6 +22,7 @@ import warnings
import pprint
import traceback
import signal
from collections import Counter
try:
import http.server as http_server
@ -662,6 +663,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
(offset, record, errors) = next(record_iter)
assert record.type == b'response'
assert record.url == url1.encode('ascii')
# check for duplicate warc record headers
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
(offset, record, errors) = next(record_iter)
assert record.type == b'request'
@ -670,6 +673,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
(offset, record, errors) = next(record_iter)
assert record.type == b'response'
assert record.url == url2.encode('ascii')
# check for duplicate warc record headers
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
(offset, record, errors) = next(record_iter)
assert record.type == b'request'
@ -678,6 +683,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
(offset, record, errors) = next(record_iter)
assert record.type == b'revisit'
assert record.url == url2.encode('ascii')
# check for duplicate warc record headers
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
(offset, record, errors) = next(record_iter)
assert record.type == b'request'
@ -686,6 +693,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
(offset, record, errors) = next(record_iter)
assert record.type == b'revisit'
assert record.url == url1.encode('ascii')
# check for duplicate warc record headers
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
(offset, record, errors) = next(record_iter)
assert record.type == b'request'

View File

@ -14,7 +14,7 @@ import rethinkstuff
class RethinkCaptures:
"""Inserts in batches every 0.5 seconds"""
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
self.r = r

View File

@ -114,8 +114,9 @@ class WarcRecordBuilder:
digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
warcprox.digest_str(digest, self.base32)))
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(digest, self.base32)))
if not payload_digest:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(digest, self.base32)))
content_tuple = content_type, data
record = warctools.WarcRecord(headers=headers, content=content_tuple)