mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Fix bug where two WARC-Payload-Digest headers were written to revisit records
This commit is contained in:
parent
2c91eb03d3
commit
42a81d8f8f
2
setup.py
2
setup.py
@ -30,7 +30,7 @@ except:
|
|||||||
deps.append('futures')
|
deps.append('futures')
|
||||||
|
|
||||||
setuptools.setup(name='warcprox',
|
setuptools.setup(name='warcprox',
|
||||||
version='1.5.0.dev2',
|
version='1.5.0.dev3',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -22,6 +22,7 @@ import warnings
|
|||||||
import pprint
|
import pprint
|
||||||
import traceback
|
import traceback
|
||||||
import signal
|
import signal
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import http.server as http_server
|
import http.server as http_server
|
||||||
@ -662,6 +663,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
|||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'response'
|
assert record.type == b'response'
|
||||||
assert record.url == url1.encode('ascii')
|
assert record.url == url1.encode('ascii')
|
||||||
|
# check for duplicate warc record headers
|
||||||
|
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'request'
|
assert record.type == b'request'
|
||||||
@ -670,6 +673,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
|||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'response'
|
assert record.type == b'response'
|
||||||
assert record.url == url2.encode('ascii')
|
assert record.url == url2.encode('ascii')
|
||||||
|
# check for duplicate warc record headers
|
||||||
|
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n'
|
||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'request'
|
assert record.type == b'request'
|
||||||
@ -678,6 +683,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
|||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'revisit'
|
assert record.type == b'revisit'
|
||||||
assert record.url == url2.encode('ascii')
|
assert record.url == url2.encode('ascii')
|
||||||
|
# check for duplicate warc record headers
|
||||||
|
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'request'
|
assert record.type == b'request'
|
||||||
@ -686,6 +693,8 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
|
|||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'revisit'
|
assert record.type == b'revisit'
|
||||||
assert record.url == url1.encode('ascii')
|
assert record.url == url1.encode('ascii')
|
||||||
|
# check for duplicate warc record headers
|
||||||
|
assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1
|
||||||
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n'
|
||||||
(offset, record, errors) = next(record_iter)
|
(offset, record, errors) = next(record_iter)
|
||||||
assert record.type == b'request'
|
assert record.type == b'request'
|
||||||
|
@ -14,7 +14,7 @@ import rethinkstuff
|
|||||||
|
|
||||||
class RethinkCaptures:
|
class RethinkCaptures:
|
||||||
"""Inserts in batches every 0.5 seconds"""
|
"""Inserts in batches every 0.5 seconds"""
|
||||||
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
|
||||||
|
|
||||||
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
|
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
|
||||||
self.r = r
|
self.r = r
|
||||||
|
@ -114,8 +114,9 @@ class WarcRecordBuilder:
|
|||||||
digest = hashlib.new(self.digest_algorithm, data)
|
digest = hashlib.new(self.digest_algorithm, data)
|
||||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||||
warcprox.digest_str(digest, self.base32)))
|
warcprox.digest_str(digest, self.base32)))
|
||||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
if not payload_digest:
|
||||||
warcprox.digest_str(digest, self.base32)))
|
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||||
|
warcprox.digest_str(digest, self.base32)))
|
||||||
|
|
||||||
content_tuple = content_type, data
|
content_tuple = content_type, data
|
||||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user