mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Fix bug where two WARC-Payload-Digest headers were written to revisit records
This commit is contained in:
parent
2c91eb03d3
commit
42a81d8f8f
2
setup.py
2
setup.py
@@ -30,7 +30,7 @@ except:
|
||||
deps.append('futures')
|
||||
|
||||
setuptools.setup(name='warcprox',
|
||||
version='1.5.0.dev2',
|
||||
version='1.5.0.dev3',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@@ -22,6 +22,7 @@ import warnings
|
||||
import pprint
|
||||
import traceback
|
||||
import signal
|
||||
from collections import Counter
|
||||
|
||||
try:
|
||||
import http.server as http_server
|
||||
@@ -662,6 +663,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'response'
|
||||
assert record.url == url1.encode('ascii')
|
||||
# check for duplicate warc record headers
|
||||
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'request'
|
||||
@@ -670,6 +673,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'response'
|
||||
assert record.url == url2.encode('ascii')
|
||||
# check for duplicate warc record headers
|
||||
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'request'
|
||||
@@ -678,6 +683,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'revisit'
|
||||
assert record.url == url2.encode('ascii')
|
||||
# check for duplicate warc record headers
|
||||
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'request'
|
||||
@@ -686,6 +693,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'revisit'
|
||||
assert record.url == url1.encode('ascii')
|
||||
# check for duplicate warc record headers
|
||||
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
||||
(offset, record, errors) = next(record_iter)
|
||||
assert record.type == b'request'
|
||||
|
@@ -14,7 +14,7 @@ import rethinkstuff
|
||||
|
||||
class RethinkCaptures:
|
||||
"""Inserts in batches every 0.5 seconds"""
|
||||
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
||||
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
|
||||
|
||||
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
|
||||
self.r = r
|
||||
|
@@ -114,8 +114,9 @@ class WarcRecordBuilder:
|
||||
digest = hashlib.new(self.digest_algorithm, data)
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||
warcprox.digest_str(digest, self.base32)))
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||
warcprox.digest_str(digest, self.base32)))
|
||||
if not payload_digest:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||
warcprox.digest_str(digest, self.base32)))
|
||||
|
||||
content_tuple = content_type, data
|
||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||
|
Loading…
x
Reference in New Issue
Block a user