fix payload digest by pulling calculation up one level where content has already been transfer-decoded

Noah Levitt 2017-11-10 17:18:22 -08:00
parent 30b6b0b337
commit 3a0f6e0947
8 changed files with 36 additions and 36 deletions
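In broad strokes: payload digest calculation moves out of ProxyingRecorder, which only ever sees the raw bytes off the socket (still chunk-framed when the response uses chunked Transfer-Encoding), and up into ProxyingRecordingHTTPResponse.read(), where the HTTP client machinery has already applied transfer decoding. The handler then hands that digest to RecordedUrl(payload_digest=...), and the dedup, crawl log, big captures table and WARC record builder code all read recorded_url.payload_digest instead of reaching into recorded_url.response_recorder. A minimal sketch of the idea follows; the class name and signature below are simplified stand-ins, not the actual warcprox code.

# Illustrative sketch only (hypothetical class, not warcprox code): hash the
# payload from what HTTPResponse.read() returns, i.e. after transfer decoding,
# rather than from the raw bytes recorded off the socket.
import hashlib
import http.client as http_client

class DigestingHTTPResponse(http_client.HTTPResponse):
    def __init__(self, sock, digest_algorithm='sha1', **kwargs):
        http_client.HTTPResponse.__init__(self, sock, **kwargs)
        self.digest_algorithm = digest_algorithm
        self.payload_digest = None

    def begin(self):
        # reads the status line and headers; the body that follows will be
        # transfer-decoded by http.client as it is read
        http_client.HTTPResponse.begin(self)
        self.payload_digest = hashlib.new(self.digest_algorithm)

    def read(self, amt=None):
        buf = http_client.HTTPResponse.read(self, amt)
        self.payload_digest.update(buf)
        return buf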

View File

@@ -51,7 +51,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.2.1b2.dev113',
+        version='2.2.1b2.dev114',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

View File

@@ -3,7 +3,7 @@
 tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
 debugging, does not write warcs
 
-Copyright (C) 2015-2016 Internet Archive
+Copyright (C) 2015-2017 Internet Archive
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -46,7 +46,7 @@ class FakeQueue(object):
         logging.info("{} {} {} {} {} size={} {}".format(
                 recorded_url.client_ip, recorded_url.status, recorded_url.method,
                 recorded_url.url.decode("utf-8"), recorded_url.mimetype,
-                recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))
+                recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8')))
 
 def parse_args():
     prog = os.path.basename(sys.argv[0])

View File

@@ -140,16 +140,16 @@ class RethinkCaptures:
         return result
 
     def _assemble_entry(self, recorded_url, records):
-        if recorded_url.response_recorder:
-            if recorded_url.response_recorder.payload_digest.name == "sha1":
+        if recorded_url.payload_digest:
+            if recorded_url.payload_digest.name == "sha1":
                 sha1base32 = base64.b32encode(
-                        recorded_url.response_recorder.payload_digest.digest()
+                        recorded_url.payload_digest.digest()
                         ).decode("utf-8")
             else:
                 self.logger.warn(
                         "digest type is %r but big captures table is indexed "
                         "by sha1",
-                        recorded_url.response_recorder.payload_digest.name)
+                        recorded_url.payload_digest.name)
         else:
             digest = hashlib.new("sha1", records[0].content[1])
             sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

View File

@@ -43,7 +43,7 @@ class CrawlLogger(object):
         if recorded_url.response_recorder:
             content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
             payload_digest = warcprox.digest_str(
-                    recorded_url.response_recorder.payload_digest,
+                    recorded_url.payload_digest,
                     self.options.base32)
         else:
             # WARCPROX_WRITE_RECORD request

View File

@@ -96,8 +96,7 @@ class DedupDb(object):
         if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(
-                    recorded_url.response_recorder.payload_digest,
-                    self.options.base32)
+                    recorded_url.payload_digest, self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(
                         digest_key, records[0],
@@ -108,9 +107,9 @@ class DedupDb(object):
 
 def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
     if (recorded_url.response_recorder
-            and recorded_url.response_recorder.payload_digest
+            and recorded_url.payload_digest
             and recorded_url.response_recorder.payload_size() > 0):
-        digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
+        digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
             recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
                     recorded_url.url)
@@ -174,8 +173,8 @@ class RethinkDedupDb:
     def notify(self, recorded_url, records):
         if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
-            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
-                    self.options.base32)
+            digest_key = warcprox.digest_str(
+                    recorded_url.payload_digest, self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
             else:

View File

@@ -66,7 +66,7 @@ import time
 class ProxyingRecorder(object):
     """
     Wraps a socket._fileobject, recording the bytes as they are read,
-    calculating digests, and sending them on to the proxy client.
+    calculating the block digest, and sending them on to the proxy client.
     """
 
     logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
@@ -78,27 +78,19 @@ class ProxyingRecorder(object):
         self.digest_algorithm = digest_algorithm
         self.block_digest = hashlib.new(digest_algorithm)
         self.payload_offset = None
-        self.payload_digest = None
         self.proxy_client = proxy_client
         self._proxy_client_conn_open = True
         self.len = 0
         self.url = url
 
     def payload_starts_now(self):
-        self.payload_digest = hashlib.new(self.digest_algorithm)
         self.payload_offset = self.len
 
-    def _update_payload_digest(self, hunk):
-        if self.payload_digest:
-            self.payload_digest.update(hunk)
-
     def _update(self, hunk):
-        self._update_payload_digest(hunk)
         self.block_digest.update(hunk)
         self.tempfile.write(hunk)
 
-        if self.payload_digest and self._proxy_client_conn_open:
+        if self.payload_offset is not None and self._proxy_client_conn_open:
             try:
                 self.proxy_client.sendall(hunk)
             except BaseException as e:
@@ -157,6 +149,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
                 self, sock, debuglevel=debuglevel, method=method)
         self.proxy_client = proxy_client
         self.url = url
+        self.digest_algorithm = digest_algorithm
 
         # Keep around extra reference to self.fp because HTTPResponse sets
         # self.fp=None after it finishes reading, but we still need it
@@ -164,6 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
                 self.fp, proxy_client, digest_algorithm, url=url)
         self.fp = self.recorder
 
+        self.payload_digest = None
+
     def begin(self, extra_response_headers={}):
         http_client.HTTPResponse.begin(self) # reads status line, headers
@@ -185,6 +180,12 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
             self.proxy_client.sendall(status_and_headers.encode('latin1'))
 
         self.recorder.payload_starts_now()
+        self.payload_digest = hashlib.new(self.digest_algorithm)
+
+    def read(self, amt=None):
+        buf = http_client.HTTPResponse.read(self, amt)
+        self.payload_digest.update(buf)
+        return buf
 
 def via_header_value(orig, request_version):
     via = orig
@@ -419,9 +420,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     url=self.url, method=self.command)
             prox_rec_res.begin(extra_response_headers=extra_response_headers)
 
-            buf = prox_rec_res.read(8192)
+            buf = prox_rec_res.read(65536)
             while buf != b'':
-                buf = prox_rec_res.read(8192)
+                buf = prox_rec_res.read(65536)
 
             self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
         except Exception as e:

View File

@@ -53,7 +53,8 @@ class WarcRecordBuilder:
                     refers_to=recorded_url.dedup_info.get('id'),
                     refers_to_target_uri=recorded_url.dedup_info['url'],
                     refers_to_date=recorded_url.dedup_info['date'],
-                    payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
+                    payload_digest=warcprox.digest_str(
+                        recorded_url.payload_digest, self.base32),
                     profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                     content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                     remote_ip=recorded_url.remote_ip)
@@ -64,7 +65,9 @@ class WarcRecordBuilder:
                     recorder=recorded_url.response_recorder,
                     warc_type=warctools.WarcRecord.RESPONSE,
                     content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
-                    remote_ip=recorded_url.remote_ip)
+                    remote_ip=recorded_url.remote_ip,
+                    payload_digest=warcprox.digest_str(
+                        recorded_url.payload_digest, self.base32))
 
     def build_warc_records(self, recorded_url):
         """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@@ -122,13 +125,8 @@ class WarcRecordBuilder:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 warcprox.digest_str(recorder.block_digest, self.base32)))
-            if recorder.payload_digest is not None:
-                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
-                    warcprox.digest_str(recorder.payload_digest, self.base32)))
-
             recorder.tempfile.seek(0)
             record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
         else:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
             digest = hashlib.new(self.digest_algorithm, data)
@@ -137,7 +135,6 @@ class WarcRecordBuilder:
             if not payload_digest:
                 headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                     warcprox.digest_str(digest, self.base32)))
-
             content_tuple = content_type, data
             record = warctools.WarcRecord(headers=headers, content=content_tuple)

View File

@@ -218,7 +218,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 content_type=content_type, method=self.command,
                 timestamp=timestamp, host=self.hostname,
                 duration=datetime.datetime.utcnow()-timestamp,
-                referer=self.headers.get('referer'))
+                referer=self.headers.get('referer'),
+                payload_digest=prox_rec_res.payload_digest)
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -328,7 +329,8 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
            status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None, referer=None):
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -366,6 +368,7 @@ class RecordedUrl:
         self.host = host
         self.duration = duration
         self.referer = referer
+        self.payload_digest = payload_digest
 
 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
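On the consuming side, the pattern after this change is that anything needing the payload digest reads recorded_url.payload_digest (a hashlib digest object, or None when no response body was digested) and turns it into a lookup key. A rough sketch of that usage; the helper and classes below are hypothetical stand-ins inspired by warcprox.digest_str and RecordedUrl, not their real implementations.

# Hypothetical consumer-side sketch (not warcprox code).
import base64
import hashlib

def digest_to_str(digest, base32=False):
    # stand-in for the idea behind warcprox.digest_str: "algo:value"
    if base32:
        value = base64.b32encode(digest.digest()).decode('ascii')
    else:
        value = digest.hexdigest()
    return '%s:%s' % (digest.name, value)

class FakeRecordedUrl:
    def __init__(self, url, payload_digest=None):
        self.url = url
        self.payload_digest = payload_digest  # hashlib object or None

def dedup_key(recorded_url, base32=True):
    # after this commit, consumers key off recorded_url.payload_digest
    # instead of recorded_url.response_recorder.payload_digest
    if recorded_url.payload_digest:
        return digest_to_str(recorded_url.payload_digest, base32)
    return None

if __name__ == '__main__':
    d = hashlib.sha1(b'example payload')
    ru = FakeRecordedUrl('http://example.com/', payload_digest=d)
    print(dedup_key(ru))  # prints "sha1:<base32 of the sha1 digest>"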