Mirror of https://github.com/internetarchive/warcprox.git
fix payload digest by pulling calculation up one level where content has already been transfer-decoded
commit 3a0f6e0947
parent 30b6b0b337
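Why the change matters: previously ProxyingRecorder hashed the payload from the raw bytes it pulled off the socket, before http.client stripped any transfer encoding, so for a chunked response the recorded payload digest covered the chunk framing rather than the entity itself. Pulling the calculation up into ProxyingRecordingHTTPResponse.read() hashes the content after it has been transfer-decoded. A minimal illustration of the mismatch (the payload here is made up, not taken from the commit):

    import hashlib

    decoded_payload = b'hello world'
    # the same payload as it travels on the wire with Transfer-Encoding: chunked
    chunked_body = b'b\r\nhello world\r\n0\r\n\r\n'

    # digest the WARC record should carry
    print(hashlib.sha1(decoded_payload).hexdigest())
    # digest the old recorder-level code would have computed
    print(hashlib.sha1(chunked_body).hexdigest())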
setup.py (2 changed lines)
@@ -51,7 +51,7 @@ except:

 setuptools.setup(
         name='warcprox',
-        version='2.2.1b2.dev113',
+        version='2.2.1b2.dev114',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
@@ -3,7 +3,7 @@
 tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
 debugging, does not write warcs

-Copyright (C) 2015-2016 Internet Archive
+Copyright (C) 2015-2017 Internet Archive

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -46,7 +46,7 @@ class FakeQueue(object):
         logging.info("{} {} {} {} {} size={} {}".format(
             recorded_url.client_ip, recorded_url.status, recorded_url.method,
             recorded_url.url.decode("utf-8"), recorded_url.mimetype,
-            recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))
+            recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8')))

 def parse_args():
     prog = os.path.basename(sys.argv[0])
@@ -140,16 +140,16 @@ class RethinkCaptures:
         return result

     def _assemble_entry(self, recorded_url, records):
-        if recorded_url.response_recorder:
-            if recorded_url.response_recorder.payload_digest.name == "sha1":
+        if recorded_url.payload_digest:
+            if recorded_url.payload_digest.name == "sha1":
                 sha1base32 = base64.b32encode(
-                        recorded_url.response_recorder.payload_digest.digest()
+                        recorded_url.payload_digest.digest()
                         ).decode("utf-8")
             else:
                 self.logger.warn(
                         "digest type is %r but big captures table is indexed "
                         "by sha1",
-                        recorded_url.response_recorder.payload_digest.name)
+                        recorded_url.payload_digest.name)
         else:
             digest = hashlib.new("sha1", records[0].content[1])
             sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
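As before, the big captures table is keyed by the base32-encoded sha1 of the payload; the hunk above only switches the source of the hashlib object from response_recorder.payload_digest to recorded_url.payload_digest. For reference, the key derivation boils down to this (the payload is illustrative, not from the commit):

    import base64, hashlib

    payload_digest = hashlib.new('sha1', b'example payload')  # stand-in for recorded_url.payload_digest
    sha1base32 = base64.b32encode(payload_digest.digest()).decode('utf-8')
    print(sha1base32)  # 32-character base32 string used as the captures index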
@@ -43,7 +43,7 @@ class CrawlLogger(object):
         if recorded_url.response_recorder:
             content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
             payload_digest = warcprox.digest_str(
-                    recorded_url.response_recorder.payload_digest,
+                    recorded_url.payload_digest,
                     self.options.base32)
         else:
             # WARCPROX_WRITE_RECORD request
@@ -96,8 +96,7 @@ class DedupDb(object):
         if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
             digest_key = warcprox.digest_str(
-                    recorded_url.response_recorder.payload_digest,
-                    self.options.base32)
+                    recorded_url.payload_digest, self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(
                         digest_key, records[0],
@@ -108,9 +107,9 @@ class DedupDb(object):

 def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
     if (recorded_url.response_recorder
-            and recorded_url.response_recorder.payload_digest
+            and recorded_url.payload_digest
             and recorded_url.response_recorder.payload_size() > 0):
-        digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
+        digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
             recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
                     recorded_url.url)
@@ -174,8 +173,8 @@ class RethinkDedupDb:
     def notify(self, recorded_url, records):
         if (records and records[0].type == b'response'
                 and recorded_url.response_recorder.payload_size() > 0):
-            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
-                    self.options.base32)
+            digest_key = warcprox.digest_str(
+                    recorded_url.payload_digest, self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
             else:
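Each of these call sites now hands recorded_url.payload_digest (a hashlib object) to warcprox.digest_str() to build the dedup key. A rough sketch of what that formatting produces, assuming digest_str renders an algorithm-prefixed hex or base32 string as in the warcprox version this commit targets (sketch only, not the library code):

    import base64, hashlib

    def digest_str(hash_obj, base32=False):
        # approximation of warcprox.digest_str(): b'sha1:<hex>' or b'sha1:<base32>'
        prefix = hash_obj.name.encode('utf-8') + b':'
        if base32:
            return prefix + base64.b32encode(hash_obj.digest())
        return prefix + hash_obj.hexdigest().encode('ascii')

    payload_digest = hashlib.sha1(b'hello world')   # stand-in for recorded_url.payload_digest
    print(digest_str(payload_digest, base32=True))  # the kind of key stored in the dedup db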
@@ -66,7 +66,7 @@ import time
 class ProxyingRecorder(object):
     """
     Wraps a socket._fileobject, recording the bytes as they are read,
-    calculating digests, and sending them on to the proxy client.
+    calculating the block digest, and sending them on to the proxy client.
     """

     logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
@@ -78,27 +78,19 @@ class ProxyingRecorder(object):
         self.digest_algorithm = digest_algorithm
         self.block_digest = hashlib.new(digest_algorithm)
         self.payload_offset = None
-        self.payload_digest = None
         self.proxy_client = proxy_client
         self._proxy_client_conn_open = True
         self.len = 0
         self.url = url

     def payload_starts_now(self):
-        self.payload_digest = hashlib.new(self.digest_algorithm)
         self.payload_offset = self.len

-    def _update_payload_digest(self, hunk):
-        if self.payload_digest:
-            self.payload_digest.update(hunk)
-
     def _update(self, hunk):
-        self._update_payload_digest(hunk)
         self.block_digest.update(hunk)

         self.tempfile.write(hunk)

-        if self.payload_digest and self._proxy_client_conn_open:
+        if self.payload_offset is not None and self._proxy_client_conn_open:
             try:
                 self.proxy_client.sendall(hunk)
             except BaseException as e:
@@ -157,6 +149,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
                 self, sock, debuglevel=debuglevel, method=method)
         self.proxy_client = proxy_client
         self.url = url
+        self.digest_algorithm = digest_algorithm

         # Keep around extra reference to self.fp because HTTPResponse sets
         # self.fp=None after it finishes reading, but we still need it
@@ -164,6 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
                 self.fp, proxy_client, digest_algorithm, url=url)
         self.fp = self.recorder

+        self.payload_digest = None
+
     def begin(self, extra_response_headers={}):
         http_client.HTTPResponse.begin(self)  # reads status line, headers

@@ -185,6 +180,12 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
         self.proxy_client.sendall(status_and_headers.encode('latin1'))

         self.recorder.payload_starts_now()
+        self.payload_digest = hashlib.new(self.digest_algorithm)
+
+    def read(self, amt=None):
+        buf = http_client.HTTPResponse.read(self, amt)
+        self.payload_digest.update(buf)
+        return buf

 def via_header_value(orig, request_version):
     via = orig
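The read() override added above relies on http.client.HTTPResponse.read() returning the body after chunked transfer-encoding has been stripped, so the digest now covers the decoded payload. A self-contained sketch of that behaviour (the fake socket and canned response are illustrative, not warcprox code):

    import hashlib
    import http.client
    import io

    class FakeSock:
        """Just enough of a socket for HTTPResponse: serves a canned chunked response."""
        def __init__(self, raw):
            self.raw = raw
        def makefile(self, mode, *args, **kwargs):
            return io.BytesIO(self.raw)

    raw = (b'HTTP/1.1 200 OK\r\n'
           b'Transfer-Encoding: chunked\r\n'
           b'\r\n'
           b'b\r\nhello world\r\n'
           b'0\r\n\r\n')

    resp = http.client.HTTPResponse(FakeSock(raw), method='GET')
    resp.begin()

    payload_digest = hashlib.sha1()
    buf = resp.read(65536)
    while buf != b'':
        payload_digest.update(buf)
        buf = resp.read(65536)

    # the digest matches the transfer-decoded payload, not the chunked wire bytes
    print(payload_digest.hexdigest() == hashlib.sha1(b'hello world').hexdigest())  # True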
@@ -419,9 +420,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     url=self.url, method=self.command)
             prox_rec_res.begin(extra_response_headers=extra_response_headers)

-            buf = prox_rec_res.read(8192)
+            buf = prox_rec_res.read(65536)
             while buf != b'':
-                buf = prox_rec_res.read(8192)
+                buf = prox_rec_res.read(65536)

             self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
         except Exception as e:
@@ -53,7 +53,8 @@ class WarcRecordBuilder:
                     refers_to=recorded_url.dedup_info.get('id'),
                     refers_to_target_uri=recorded_url.dedup_info['url'],
                     refers_to_date=recorded_url.dedup_info['date'],
-                    payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
+                    payload_digest=warcprox.digest_str(
+                        recorded_url.payload_digest, self.base32),
                     profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                     content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                     remote_ip=recorded_url.remote_ip)
@@ -64,7 +65,9 @@ class WarcRecordBuilder:
                     recorder=recorded_url.response_recorder,
                     warc_type=warctools.WarcRecord.RESPONSE,
                     content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
-                    remote_ip=recorded_url.remote_ip)
+                    remote_ip=recorded_url.remote_ip,
+                    payload_digest=warcprox.digest_str(
+                        recorded_url.payload_digest, self.base32))

     def build_warc_records(self, recorded_url):
         """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@@ -122,13 +125,8 @@ class WarcRecordBuilder:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 warcprox.digest_str(recorder.block_digest, self.base32)))
-            if recorder.payload_digest is not None:
-                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
-                    warcprox.digest_str(recorder.payload_digest, self.base32)))
-
             recorder.tempfile.seek(0)
             record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
-
         else:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
             digest = hashlib.new(self.digest_algorithm, data)
@@ -137,7 +135,6 @@
             if not payload_digest:
                 headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                     warcprox.digest_str(digest, self.base32)))
-
             content_tuple = content_type, data
             record = warctools.WarcRecord(headers=headers, content=content_tuple)

@@ -218,7 +218,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 content_type=content_type, method=self.command,
                 timestamp=timestamp, host=self.hostname,
                 duration=datetime.datetime.utcnow()-timestamp,
-                referer=self.headers.get('referer'))
+                referer=self.headers.get('referer'),
+                payload_digest=prox_rec_res.payload_digest)
         self.server.recorded_url_q.put(recorded_url)

         return recorded_url
@@ -328,7 +329,8 @@ class RecordedUrl:
     def __init__(self, url, request_data, response_recorder, remote_ip,
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
-            timestamp=None, host=None, duration=None, referer=None):
+            timestamp=None, host=None, duration=None, referer=None,
+            payload_digest=None):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -366,6 +368,7 @@ class RecordedUrl:
         self.host = host
         self.duration = duration
         self.referer = referer
+        self.payload_digest = payload_digest

 # inherit from object so that multiple inheritance from this class works
 # properly in python 2