support revisit records in playback proxy

This commit is contained in:
Noah Levitt 2013-11-01 19:04:48 -07:00
parent 77d33f21a8
commit 121ecca830
3 changed files with 107 additions and 38 deletions

View File

@ -85,11 +85,12 @@ incorporated into warctools mainline.
- python3 - python3
- special handling for 304 not-modified (write nothing or write revisit - special handling for 304 not-modified (write nothing or write revisit
record... and/or modify request so server never responds with 304) record... and/or modify request so server never responds with 304)
- instant playback on a second proxy port - ~~instant playback on a second proxy port~~
- browser plugin for warcprox mode - browser plugin for warcprox mode
* accept warcprox CA cert only when in warcprox mode * accept warcprox CA cert only when in warcprox mode
* separate temporary cookie store, like incognito * separate temporary cookie store, like incognito
* "careful! your activity is being archived" banner * "careful! your activity is being archived" banner
* easy switch between archiving and instant playback proxy port
#### To not do #### To not do

View File

@ -5,7 +5,7 @@
""" """
Dump contents of database to stdout. Database can be any file that the anydbm Dump contents of database to stdout. Database can be any file that the anydbm
module can read. Included with warcprox because it's useful for inspecting a module can read. Included with warcprox because it's useful for inspecting a
deduplication database, but it is a generic tool. deduplication database or a playback index database, but it is a generic tool.
""" """
import anydbm import anydbm

View File

@ -14,7 +14,7 @@ import OpenSSL
import ssl import ssl
import logging import logging
import sys import sys
from hanzo import warctools from hanzo import warctools, httptools
import hashlib import hashlib
from datetime import datetime from datetime import datetime
import Queue import Queue
@ -30,7 +30,7 @@ import tempfile
import base64 import base64
import anydbm import anydbm
import json import json
import contextlib import traceback
class CertificateAuthority(object): class CertificateAuthority(object):
@ -247,6 +247,7 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
) )
) )
def _connect_to_host(self): def _connect_to_host(self):
# Connect to destination # Connect to destination
self._proxy_sock = socket.socket() self._proxy_sock = socket.socket()
@ -323,7 +324,6 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
def _proxy_request(self): def _proxy_request(self):
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!') raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
def __getattr__(self, item): def __getattr__(self, item):
if item.startswith('do_'): if item.startswith('do_'):
return self.do_COMMAND return self.do_COMMAND
@ -423,20 +423,30 @@ class PlaybackProxyHandler(MitmProxyHandler):
date, location = self.server.playback_index_db.lookup_latest(self.url) date, location = self.server.playback_index_db.lookup_latest(self.url)
logging.info('lookup_latest returned {}:{}'.format(date, location)) logging.info('lookup_latest returned {}:{}'.format(date, location))
response = None
if location is not None: if location is not None:
response = self.gather_response(location['f'], location['o']) try:
response = self.gather_response(location['f'], location['o'])
if response is None: except:
logging.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc())
response = ('HTTP/1.1 500 Internal Server Error\r\n'
+ 'Content-Type: text/plain\r\n'
+ 'Content-Length: {}\r\n'
+ '\r\n'
+ '{}').format(len(payload), payload)
else:
response = ('HTTP/1.1 404 Not Found\r\n' response = ('HTTP/1.1 404 Not Found\r\n'
+ 'Content-Type: text/plain\r\n' + 'Content-Type: text/plain\r\n'
+ 'Content-Length: 15\r\n' + 'Content-Length: 19\r\n'
+ '\r\n' + '\r\n'
+ 'not in archive\n') + '404 Not in Archive\n')
self.connection.sendall(response) self.connection.sendall(response)
def gather_response(self, warcfilename, offset):
def open_warc_at_offset(self, warcfilename, offset):
logging.debug('opening {} at offset {}'.format(warcfilename, offset))
warcpath = None warcpath = None
for p in (os.path.sep.join([self.server.warcs_dir, warcfilename]), for p in (os.path.sep.join([self.server.warcs_dir, warcfilename]),
os.path.sep.join([self.server.warcs_dir, '{}.open'.format(warcfilename)])): os.path.sep.join([self.server.warcs_dir, '{}.open'.format(warcfilename)])):
@ -444,27 +454,72 @@ class PlaybackProxyHandler(MitmProxyHandler):
warcpath = p warcpath = p
if warcpath is None: if warcpath is None:
logging.error('{} not found'.format(warcfilename)) raise Exception('{} not found'.format(warcfilename))
return None
fh = warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset) return warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset)
with contextlib.closing(fh):
# returns payload starting after http headers
def warc_record_http_payload(self, warcfilename, offset):
fh = self.open_warc_at_offset(warcfilename, offset)
try:
for (offset, record, errors) in fh.read_records(limit=1, offsets=True): for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
pass pass
logging.info('record_stream.read_records() returned {}'.format((offset,record,errors))) if errors:
raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors))
if record: warc_type = record.get_header(warctools.WarcRecord.TYPE)
content_type, content = record.content if warc_type != warctools.WarcRecord.RESPONSE:
return content raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(warc_type))
# if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
# content = parse_http_response(record)
elif errors:
logging.error('warc errors at {}:{} -- {}'.format(warcpath, offset, errors))
return None
logging.error('warctools reader returned no warc record and no errors??') m = re.search(r'\n\r?\n', record.content[1])
return None if m is None:
raise Exception('end of http headers not found in record at {} offset {}'.format(warcfilename, offset))
return record.content[1][m.end():]
finally:
fh.close()
def gather_response(self, warcfilename, offset):
fh = self.open_warc_at_offset(warcfilename, offset)
try:
for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
pass
if errors:
raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors))
warc_type = record.get_header(warctools.WarcRecord.TYPE)
if warc_type == warctools.WarcRecord.RESPONSE:
return record.content[1]
elif warc_type == warctools.WarcRecord.REVISIT:
# response consists of http headers from revisit record and
# payload from the referenced record
warc_profile = record.get_header(warctools.WarcRecord.PROFILE)
if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
raise Exception('unknown revisit record profile {}'.format(warc_profile))
refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)
logging.debug('revisit record references {} capture of {}'.format(refers_to_date, refers_to_target_uri))
location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date)
logging.debug('loading http payload from {}'.format(location))
http_payload = self.warc_record_http_payload(location['f'], location['o'])
return record.content[1] + http_payload
else:
raise Exception('unknown warc record type {}'.format(warc_type))
finally:
fh.close()
raise Exception('should not reach this point')
class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer): class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
@ -514,7 +569,7 @@ class DedupDb:
json_value = json.dumps(py_value, separators=(',',':')) json_value = json.dumps(py_value, separators=(',',':'))
self.db[key] = json_value self.db[key] = json_value
logging.info('dedup db saved {}:{}'.format(key, json_value)) logging.debug('dedup db saved {}:{}'.format(key, json_value))
def lookup(self, key): def lookup(self, key):
@ -589,23 +644,22 @@ class WarcWriterThread(threading.Thread):
refers_to_target_uri=dedup_info['u'], refers_to_target_uri=dedup_info['u'],
refers_to_date=dedup_info['d'], refers_to_date=dedup_info['d'],
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
content_type=warctools.WarcRecord.HTTP_RESPONSE_MIMETYPE, content_type=httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip) remote_ip=recorded_url.remote_ip)
else: else:
# response record # response record
principal_record, principal_record_id = self.build_warc_record( principal_record, principal_record_id = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date, url=recorded_url.url, warc_date=warc_date,
recorder=recorded_url.response_recorder, recorder=recorded_url.response_recorder,
warc_type=warctools.WarcRecord.RESPONSE, warc_type=warctools.WarcRecord.RESPONSE,
content_type=warctools.WarcRecord.HTTP_RESPONSE_MIMETYPE, content_type=httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip) remote_ip=recorded_url.remote_ip)
request_record, request_record_id = self.build_warc_record( request_record, request_record_id = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date, url=recorded_url.url, warc_date=warc_date,
data=recorded_url.request_data, data=recorded_url.request_data,
warc_type=warctools.WarcRecord.REQUEST, warc_type=warctools.WarcRecord.REQUEST,
content_type=warctools.WarcRecord.HTTP_REQUEST_MIMETYPE, content_type=httptools.RequestMessage.CONTENT_TYPE,
concurrent_to=principal_record_id) concurrent_to=principal_record_id)
return principal_record, request_record return principal_record, request_record
@ -635,7 +689,7 @@ class WarcWriterThread(threading.Thread):
if remote_ip is not None: if remote_ip is not None:
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
if profile is not None: if profile is not None:
headers.append((warctools.WarcRecord.TYPE, profile)) headers.append((warctools.WarcRecord.PROFILE, profile))
if refers_to is not None: if refers_to is not None:
headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
if refers_to_target_uri is not None: if refers_to_target_uri is not None:
@ -764,7 +818,7 @@ class WarcWriterThread(threading.Thread):
for record in recordset: for record in recordset:
offset = writer.tell() offset = writer.tell()
record.write_to(writer, gzip=self.gzip) record.write_to(writer, gzip=self.gzip)
logging.info('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format( logging.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
record.get_header(warctools.WarcRecord.TYPE), record.get_header(warctools.WarcRecord.TYPE),
record.get_header(warctools.WarcRecord.CONTENT_LENGTH), record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
record.get_header(warctools.WarcRecord.URL), record.get_header(warctools.WarcRecord.URL),
@ -779,7 +833,7 @@ class WarcWriterThread(threading.Thread):
and self.rollover_idle_time is not None and self.rollover_idle_time is not None
and self.rollover_idle_time > 0 and self.rollover_idle_time > 0
and time.time() - self._last_activity > self.rollover_idle_time): and time.time() - self._last_activity > self.rollover_idle_time):
logging.info('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity)) logging.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
self._close_writer() self._close_writer()
if time.time() - self._last_sync > 60: if time.time() - self._last_sync > 60:
@ -814,7 +868,7 @@ class PlaybackIndexDb:
def save(self, warcfile, recordset, offset): def save(self, warcfile, recordset, offset):
response_record = recordset[0] response_record = recordset[0]
# XXX canonicalize url # XXX canonicalize url?
url = response_record.get_header(warctools.WarcRecord.URL) url = response_record.get_header(warctools.WarcRecord.URL)
date = response_record.get_header(warctools.WarcRecord.DATE) date = response_record.get_header(warctools.WarcRecord.DATE)
@ -831,7 +885,7 @@ class PlaybackIndexDb:
self.db[url] = json_value self.db[url] = json_value
logging.info('playback index saved: {}:{}'.format(url, json_value)) logging.debug('playback index saved: {}:{}'.format(url, json_value))
def lookup_latest(self, url): def lookup_latest(self, url):
@ -844,12 +898,26 @@ class PlaybackIndexDb:
latest_date = max(py_value) latest_date = max(py_value)
return latest_date, py_value[latest_date] return latest_date, py_value[latest_date]
def lookup_exact(self, url, warc_date):
if url not in self.db:
return None
json_value = self.db[url]
py_value = json.loads(json_value)
if warc_date in py_value:
return py_value[warc_date]
else:
return None
if __name__ == '__main__': if __name__ == '__main__':
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
description='warcprox - WARC writing MITM HTTP/S proxy', description='warcprox - WARC writing MITM HTTP/S proxy',
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-p', '--port', dest='port', default='8080', arg_parser.add_argument('-p', '--port', dest='port', default='8000',
help='port to listen on') help='port to listen on')
arg_parser.add_argument('-b', '--address', dest='address', arg_parser.add_argument('-b', '--address', dest='address',
default='localhost', help='address to listen on') default='localhost', help='address to listen on')