mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix payload digest by pulling calculation up one level where content has already been transfer-decoded
This commit is contained in:
parent
30b6b0b337
commit
3a0f6e0947
2
setup.py
2
setup.py
@@ -51,7 +51,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.2.1b2.dev113',
|
||||
version='2.2.1b2.dev114',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@@ -3,7 +3,7 @@
|
||||
tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
|
||||
debugging, does not write warcs
|
||||
|
||||
Copyright (C) 2015-2016 Internet Archive
|
||||
Copyright (C) 2015-2017 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
@@ -46,7 +46,7 @@ class FakeQueue(object):
|
||||
logging.info("{} {} {} {} {} size={} {}".format(
|
||||
recorded_url.client_ip, recorded_url.status, recorded_url.method,
|
||||
recorded_url.url.decode("utf-8"), recorded_url.mimetype,
|
||||
recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))
|
||||
recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8')))
|
||||
|
||||
def parse_args():
|
||||
prog = os.path.basename(sys.argv[0])
|
||||
|
@@ -140,16 +140,16 @@ class RethinkCaptures:
|
||||
return result
|
||||
|
||||
def _assemble_entry(self, recorded_url, records):
|
||||
if recorded_url.response_recorder:
|
||||
if recorded_url.response_recorder.payload_digest.name == "sha1":
|
||||
if recorded_url.payload_digest:
|
||||
if recorded_url.payload_digest.name == "sha1":
|
||||
sha1base32 = base64.b32encode(
|
||||
recorded_url.response_recorder.payload_digest.digest()
|
||||
recorded_url.payload_digest.digest()
|
||||
).decode("utf-8")
|
||||
else:
|
||||
self.logger.warn(
|
||||
"digest type is %r but big captures table is indexed "
|
||||
"by sha1",
|
||||
recorded_url.response_recorder.payload_digest.name)
|
||||
recorded_url.payload_digest.name)
|
||||
else:
|
||||
digest = hashlib.new("sha1", records[0].content[1])
|
||||
sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
|
||||
|
@@ -43,7 +43,7 @@ class CrawlLogger(object):
|
||||
if recorded_url.response_recorder:
|
||||
content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
|
||||
payload_digest = warcprox.digest_str(
|
||||
recorded_url.response_recorder.payload_digest,
|
||||
recorded_url.payload_digest,
|
||||
self.options.base32)
|
||||
else:
|
||||
# WARCPROX_WRITE_RECORD request
|
||||
|
@@ -96,8 +96,7 @@ class DedupDb(object):
|
||||
if (records and records[0].type == b'response'
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
digest_key = warcprox.digest_str(
|
||||
recorded_url.response_recorder.payload_digest,
|
||||
self.options.base32)
|
||||
recorded_url.payload_digest, self.options.base32)
|
||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||
self.save(
|
||||
digest_key, records[0],
|
||||
@@ -108,9 +107,9 @@ class DedupDb(object):
|
||||
|
||||
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
||||
if (recorded_url.response_recorder
|
||||
and recorded_url.response_recorder.payload_digest
|
||||
and recorded_url.payload_digest
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
||||
digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
|
||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||
recorded_url.url)
|
||||
@@ -174,8 +173,8 @@ class RethinkDedupDb:
|
||||
def notify(self, recorded_url, records):
|
||||
if (records and records[0].type == b'response'
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
|
||||
self.options.base32)
|
||||
digest_key = warcprox.digest_str(
|
||||
recorded_url.payload_digest, self.options.base32)
|
||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
||||
else:
|
||||
|
@@ -66,7 +66,7 @@ import time
|
||||
class ProxyingRecorder(object):
|
||||
"""
|
||||
Wraps a socket._fileobject, recording the bytes as they are read,
|
||||
calculating digests, and sending them on to the proxy client.
|
||||
calculating the block digest, and sending them on to the proxy client.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
|
||||
@@ -78,27 +78,19 @@ class ProxyingRecorder(object):
|
||||
self.digest_algorithm = digest_algorithm
|
||||
self.block_digest = hashlib.new(digest_algorithm)
|
||||
self.payload_offset = None
|
||||
self.payload_digest = None
|
||||
self.proxy_client = proxy_client
|
||||
self._proxy_client_conn_open = True
|
||||
self.len = 0
|
||||
self.url = url
|
||||
|
||||
def payload_starts_now(self):
|
||||
self.payload_digest = hashlib.new(self.digest_algorithm)
|
||||
self.payload_offset = self.len
|
||||
|
||||
def _update_payload_digest(self, hunk):
|
||||
if self.payload_digest:
|
||||
self.payload_digest.update(hunk)
|
||||
|
||||
def _update(self, hunk):
|
||||
self._update_payload_digest(hunk)
|
||||
self.block_digest.update(hunk)
|
||||
|
||||
self.tempfile.write(hunk)
|
||||
|
||||
if self.payload_digest and self._proxy_client_conn_open:
|
||||
if self.payload_offset is not None and self._proxy_client_conn_open:
|
||||
try:
|
||||
self.proxy_client.sendall(hunk)
|
||||
except BaseException as e:
|
||||
@@ -157,6 +149,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
self, sock, debuglevel=debuglevel, method=method)
|
||||
self.proxy_client = proxy_client
|
||||
self.url = url
|
||||
self.digest_algorithm = digest_algorithm
|
||||
|
||||
# Keep around extra reference to self.fp because HTTPResponse sets
|
||||
# self.fp=None after it finishes reading, but we still need it
|
||||
@@ -164,6 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
self.fp, proxy_client, digest_algorithm, url=url)
|
||||
self.fp = self.recorder
|
||||
|
||||
self.payload_digest = None
|
||||
|
||||
def begin(self, extra_response_headers={}):
|
||||
http_client.HTTPResponse.begin(self) # reads status line, headers
|
||||
|
||||
@@ -185,6 +180,12 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
self.proxy_client.sendall(status_and_headers.encode('latin1'))
|
||||
|
||||
self.recorder.payload_starts_now()
|
||||
self.payload_digest = hashlib.new(self.digest_algorithm)
|
||||
|
||||
def read(self, amt=None):
|
||||
buf = http_client.HTTPResponse.read(self, amt)
|
||||
self.payload_digest.update(buf)
|
||||
return buf
|
||||
|
||||
def via_header_value(orig, request_version):
|
||||
via = orig
|
||||
@@ -419,9 +420,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
url=self.url, method=self.command)
|
||||
prox_rec_res.begin(extra_response_headers=extra_response_headers)
|
||||
|
||||
buf = prox_rec_res.read(8192)
|
||||
buf = prox_rec_res.read(65536)
|
||||
while buf != b'':
|
||||
buf = prox_rec_res.read(8192)
|
||||
buf = prox_rec_res.read(65536)
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
except Exception as e:
|
||||
|
@@ -53,7 +53,8 @@ class WarcRecordBuilder:
|
||||
refers_to=recorded_url.dedup_info.get('id'),
|
||||
refers_to_target_uri=recorded_url.dedup_info['url'],
|
||||
refers_to_date=recorded_url.dedup_info['date'],
|
||||
payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
|
||||
payload_digest=warcprox.digest_str(
|
||||
recorded_url.payload_digest, self.base32),
|
||||
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
|
||||
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
|
||||
remote_ip=recorded_url.remote_ip)
|
||||
@@ -64,7 +65,9 @@ class WarcRecordBuilder:
|
||||
recorder=recorded_url.response_recorder,
|
||||
warc_type=warctools.WarcRecord.RESPONSE,
|
||||
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
|
||||
remote_ip=recorded_url.remote_ip)
|
||||
remote_ip=recorded_url.remote_ip,
|
||||
payload_digest=warcprox.digest_str(
|
||||
recorded_url.payload_digest, self.base32))
|
||||
|
||||
def build_warc_records(self, recorded_url):
|
||||
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
|
||||
@@ -122,13 +125,8 @@ class WarcRecordBuilder:
|
||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
|
||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||
warcprox.digest_str(recorder.block_digest, self.base32)))
|
||||
if recorder.payload_digest is not None:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||
warcprox.digest_str(recorder.payload_digest, self.base32)))
|
||||
|
||||
recorder.tempfile.seek(0)
|
||||
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
|
||||
|
||||
else:
|
||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
|
||||
digest = hashlib.new(self.digest_algorithm, data)
|
||||
@@ -137,7 +135,6 @@ class WarcRecordBuilder:
|
||||
if not payload_digest:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||
warcprox.digest_str(digest, self.base32)))
|
||||
|
||||
content_tuple = content_type, data
|
||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||
|
||||
|
@@ -218,7 +218,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
content_type=content_type, method=self.command,
|
||||
timestamp=timestamp, host=self.hostname,
|
||||
duration=datetime.datetime.utcnow()-timestamp,
|
||||
referer=self.headers.get('referer'))
|
||||
referer=self.headers.get('referer'),
|
||||
payload_digest=prox_rec_res.payload_digest)
|
||||
self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
return recorded_url
|
||||
@@ -328,7 +329,8 @@ class RecordedUrl:
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||
warcprox_meta=None, content_type=None, custom_type=None,
|
||||
status=None, size=None, client_ip=None, method=None,
|
||||
timestamp=None, host=None, duration=None, referer=None):
|
||||
timestamp=None, host=None, duration=None, referer=None,
|
||||
payload_digest=None):
|
||||
# XXX should test what happens with non-ascii url (when does
|
||||
# url-encoding happen?)
|
||||
if type(url) is not bytes:
|
||||
@@ -366,6 +368,7 @@ class RecordedUrl:
|
||||
self.host = host
|
||||
self.duration = duration
|
||||
self.referer = referer
|
||||
self.payload_digest = payload_digest
|
||||
|
||||
# inherit from object so that multiple inheritance from this class works
|
||||
# properly in python 2
|
||||
|
Loading…
x
Reference in New Issue
Block a user