mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
support revisit records in playback proxy
This commit is contained in:
parent
77d33f21a8
commit
121ecca830
@ -85,11 +85,12 @@ incorporated into warctools mainline.
|
|||||||
- python3
|
- python3
|
||||||
- special handling for 304 not-modified (write nothing or write revisit
|
- special handling for 304 not-modified (write nothing or write revisit
|
||||||
record... and/or modify request so server never responds with 304)
|
record... and/or modify request so server never responds with 304)
|
||||||
- instant playback on a second proxy port
|
- ~~instant playback on a second proxy port~~
|
||||||
- browser plugin for warcprox mode
|
- browser plugin for warcprox mode
|
||||||
* accept warcprox CA cert only when in warcprox mode
|
* accept warcprox CA cert only when in warcprox mode
|
||||||
* separate temporary cookie store, like incognito
|
* separate temporary cookie store, like incognito
|
||||||
* "careful! your activity is being archived" banner
|
* "careful! your activity is being archived" banner
|
||||||
|
* easy switch between archiving and instant playback proxy port
|
||||||
|
|
||||||
#### To not do
|
#### To not do
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
"""
|
"""
|
||||||
Dump contents of database to stdout. Database can be any file that the anydbm
|
Dump contents of database to stdout. Database can be any file that the anydbm
|
||||||
module can read. Included with warcprox because it's useful for inspecting a
|
module can read. Included with warcprox because it's useful for inspecting a
|
||||||
deduplication database, but it is a generic tool.
|
deduplication database or a playback index database, but it is a generic tool.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import anydbm
|
import anydbm
|
||||||
|
140
warcprox.py
140
warcprox.py
@ -14,7 +14,7 @@ import OpenSSL
|
|||||||
import ssl
|
import ssl
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from hanzo import warctools
|
from hanzo import warctools, httptools
|
||||||
import hashlib
|
import hashlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import Queue
|
import Queue
|
||||||
@ -30,7 +30,7 @@ import tempfile
|
|||||||
import base64
|
import base64
|
||||||
import anydbm
|
import anydbm
|
||||||
import json
|
import json
|
||||||
import contextlib
|
import traceback
|
||||||
|
|
||||||
class CertificateAuthority(object):
|
class CertificateAuthority(object):
|
||||||
|
|
||||||
@ -247,6 +247,7 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _connect_to_host(self):
|
def _connect_to_host(self):
|
||||||
# Connect to destination
|
# Connect to destination
|
||||||
self._proxy_sock = socket.socket()
|
self._proxy_sock = socket.socket()
|
||||||
@ -323,7 +324,6 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
|||||||
def _proxy_request(self):
|
def _proxy_request(self):
|
||||||
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
||||||
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
def __getattr__(self, item):
|
||||||
if item.startswith('do_'):
|
if item.startswith('do_'):
|
||||||
return self.do_COMMAND
|
return self.do_COMMAND
|
||||||
@ -423,20 +423,30 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
date, location = self.server.playback_index_db.lookup_latest(self.url)
|
date, location = self.server.playback_index_db.lookup_latest(self.url)
|
||||||
logging.info('lookup_latest returned {}:{}'.format(date, location))
|
logging.info('lookup_latest returned {}:{}'.format(date, location))
|
||||||
|
|
||||||
response = None
|
|
||||||
if location is not None:
|
if location is not None:
|
||||||
response = self.gather_response(location['f'], location['o'])
|
try:
|
||||||
|
response = self.gather_response(location['f'], location['o'])
|
||||||
if response is None:
|
except:
|
||||||
|
logging.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
|
||||||
|
payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc())
|
||||||
|
response = ('HTTP/1.1 500 Internal Server Error\r\n'
|
||||||
|
+ 'Content-Type: text/plain\r\n'
|
||||||
|
+ 'Content-Length: {}\r\n'
|
||||||
|
+ '\r\n'
|
||||||
|
+ '{}').format(len(payload), payload)
|
||||||
|
else:
|
||||||
response = ('HTTP/1.1 404 Not Found\r\n'
|
response = ('HTTP/1.1 404 Not Found\r\n'
|
||||||
+ 'Content-Type: text/plain\r\n'
|
+ 'Content-Type: text/plain\r\n'
|
||||||
+ 'Content-Length: 15\r\n'
|
+ 'Content-Length: 19\r\n'
|
||||||
+ '\r\n'
|
+ '\r\n'
|
||||||
+ 'not in archive\n')
|
+ '404 Not in Archive\n')
|
||||||
|
|
||||||
self.connection.sendall(response)
|
self.connection.sendall(response)
|
||||||
|
|
||||||
def gather_response(self, warcfilename, offset):
|
|
||||||
|
def open_warc_at_offset(self, warcfilename, offset):
|
||||||
|
logging.debug('opening {} at offset {}'.format(warcfilename, offset))
|
||||||
|
|
||||||
warcpath = None
|
warcpath = None
|
||||||
for p in (os.path.sep.join([self.server.warcs_dir, warcfilename]),
|
for p in (os.path.sep.join([self.server.warcs_dir, warcfilename]),
|
||||||
os.path.sep.join([self.server.warcs_dir, '{}.open'.format(warcfilename)])):
|
os.path.sep.join([self.server.warcs_dir, '{}.open'.format(warcfilename)])):
|
||||||
@ -444,27 +454,72 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
warcpath = p
|
warcpath = p
|
||||||
|
|
||||||
if warcpath is None:
|
if warcpath is None:
|
||||||
logging.error('{} not found'.format(warcfilename))
|
raise Exception('{} not found'.format(warcfilename))
|
||||||
return None
|
|
||||||
|
|
||||||
fh = warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset)
|
return warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset)
|
||||||
with contextlib.closing(fh):
|
|
||||||
|
|
||||||
|
# returns payload starting after http headers
|
||||||
|
def warc_record_http_payload(self, warcfilename, offset):
|
||||||
|
fh = self.open_warc_at_offset(warcfilename, offset)
|
||||||
|
try:
|
||||||
for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
|
for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
logging.info('record_stream.read_records() returned {}'.format((offset,record,errors)))
|
if errors:
|
||||||
|
raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors))
|
||||||
|
|
||||||
if record:
|
warc_type = record.get_header(warctools.WarcRecord.TYPE)
|
||||||
content_type, content = record.content
|
if warc_type != warctools.WarcRecord.RESPONSE:
|
||||||
return content
|
raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(warc_type))
|
||||||
# if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
|
|
||||||
# content = parse_http_response(record)
|
|
||||||
elif errors:
|
|
||||||
logging.error('warc errors at {}:{} -- {}'.format(warcpath, offset, errors))
|
|
||||||
return None
|
|
||||||
|
|
||||||
logging.error('warctools reader returned no warc record and no errors??')
|
m = re.search(r'\n\r?\n', record.content[1])
|
||||||
return None
|
if m is None:
|
||||||
|
raise Exception('end of http headers not found in record at {} offset {}'.format(warcfilename, offset))
|
||||||
|
return record.content[1][m.end():]
|
||||||
|
|
||||||
|
finally:
|
||||||
|
fh.close()
|
||||||
|
|
||||||
|
|
||||||
|
def gather_response(self, warcfilename, offset):
    """Build the raw http response to play back from a warc record.

    Reads a single record from ``warcfilename`` at ``offset``.

    - For a "response" record, the record content is returned as is.
    - For a "revisit" record with the identical-payload-digest profile,
      the http headers stored in the revisit record are combined with the
      http payload of the capture it refers to, which is located through
      ``self.server.playback_index_db.lookup_exact()``.

    Raises:
        Exception: if the reader reports errors or returns no record, if
            the record type or revisit profile is not supported, or if the
            capture a revisit record refers to is not in the playback
            index.
    """
    fh = self.open_warc_at_offset(warcfilename, offset)
    try:
        # Pre-bind the loop targets so that an empty read is reported as a
        # clear error below instead of surfacing as a NameError. A separate
        # name is used for the yielded offset so the parameter is not
        # silently rebound by the loop.
        record_offset, record, errors = offset, None, None
        for (record_offset, record, errors) in fh.read_records(limit=1, offsets=True):
            pass

        if errors:
            raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, record_offset, errors))

        if record is None:
            raise Exception('no warc record found at {} offset {}'.format(warcfilename, offset))

        warc_type = record.get_header(warctools.WarcRecord.TYPE)

        if warc_type == warctools.WarcRecord.RESPONSE:
            return record.content[1]

        if warc_type == warctools.WarcRecord.REVISIT:
            # response consists of http headers from revisit record and
            # payload from the referenced record
            warc_profile = record.get_header(warctools.WarcRecord.PROFILE)
            if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
                raise Exception('unknown revisit record profile {}'.format(warc_profile))

            refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
            refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)

            logging.debug('revisit record references {} capture of {}'.format(refers_to_date, refers_to_target_uri))
            location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date)
            if location is None:
                # lookup_exact() returns None for an unknown url or date;
                # fail with a message that names the missing capture rather
                # than a TypeError from subscripting None.
                raise Exception('{} capture of {} referenced by revisit record at {} offset {} not found in playback index'.format(
                    refers_to_date, refers_to_target_uri, warcfilename, offset))

            logging.debug('loading http payload from {}'.format(location))
            http_payload = self.warc_record_http_payload(location['f'], location['o'])

            return record.content[1] + http_payload

        raise Exception('unknown warc record type {}'.format(warc_type))

    finally:
        # Always release the warc file handle, on return or on error.
        fh.close()
|
||||||
|
|
||||||
|
|
||||||
class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
|
class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
|
||||||
@ -514,7 +569,7 @@ class DedupDb:
|
|||||||
json_value = json.dumps(py_value, separators=(',',':'))
|
json_value = json.dumps(py_value, separators=(',',':'))
|
||||||
|
|
||||||
self.db[key] = json_value
|
self.db[key] = json_value
|
||||||
logging.info('dedup db saved {}:{}'.format(key, json_value))
|
logging.debug('dedup db saved {}:{}'.format(key, json_value))
|
||||||
|
|
||||||
|
|
||||||
def lookup(self, key):
|
def lookup(self, key):
|
||||||
@ -589,23 +644,22 @@ class WarcWriterThread(threading.Thread):
|
|||||||
refers_to_target_uri=dedup_info['u'],
|
refers_to_target_uri=dedup_info['u'],
|
||||||
refers_to_date=dedup_info['d'],
|
refers_to_date=dedup_info['d'],
|
||||||
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
|
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
|
||||||
content_type=warctools.WarcRecord.HTTP_RESPONSE_MIMETYPE,
|
content_type=httptools.ResponseMessage.CONTENT_TYPE,
|
||||||
remote_ip=recorded_url.remote_ip)
|
remote_ip=recorded_url.remote_ip)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# response record
|
# response record
|
||||||
principal_record, principal_record_id = self.build_warc_record(
|
principal_record, principal_record_id = self.build_warc_record(
|
||||||
url=recorded_url.url, warc_date=warc_date,
|
url=recorded_url.url, warc_date=warc_date,
|
||||||
recorder=recorded_url.response_recorder,
|
recorder=recorded_url.response_recorder,
|
||||||
warc_type=warctools.WarcRecord.RESPONSE,
|
warc_type=warctools.WarcRecord.RESPONSE,
|
||||||
content_type=warctools.WarcRecord.HTTP_RESPONSE_MIMETYPE,
|
content_type=httptools.ResponseMessage.CONTENT_TYPE,
|
||||||
remote_ip=recorded_url.remote_ip)
|
remote_ip=recorded_url.remote_ip)
|
||||||
|
|
||||||
request_record, request_record_id = self.build_warc_record(
|
request_record, request_record_id = self.build_warc_record(
|
||||||
url=recorded_url.url, warc_date=warc_date,
|
url=recorded_url.url, warc_date=warc_date,
|
||||||
data=recorded_url.request_data,
|
data=recorded_url.request_data,
|
||||||
warc_type=warctools.WarcRecord.REQUEST,
|
warc_type=warctools.WarcRecord.REQUEST,
|
||||||
content_type=warctools.WarcRecord.HTTP_REQUEST_MIMETYPE,
|
content_type=httptools.RequestMessage.CONTENT_TYPE,
|
||||||
concurrent_to=principal_record_id)
|
concurrent_to=principal_record_id)
|
||||||
|
|
||||||
return principal_record, request_record
|
return principal_record, request_record
|
||||||
@ -635,7 +689,7 @@ class WarcWriterThread(threading.Thread):
|
|||||||
if remote_ip is not None:
|
if remote_ip is not None:
|
||||||
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
|
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
|
||||||
if profile is not None:
|
if profile is not None:
|
||||||
headers.append((warctools.WarcRecord.TYPE, profile))
|
headers.append((warctools.WarcRecord.PROFILE, profile))
|
||||||
if refers_to is not None:
|
if refers_to is not None:
|
||||||
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
|
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
|
||||||
if refers_to_target_uri is not None:
|
if refers_to_target_uri is not None:
|
||||||
@ -764,7 +818,7 @@ class WarcWriterThread(threading.Thread):
|
|||||||
for record in recordset:
|
for record in recordset:
|
||||||
offset = writer.tell()
|
offset = writer.tell()
|
||||||
record.write_to(writer, gzip=self.gzip)
|
record.write_to(writer, gzip=self.gzip)
|
||||||
logging.info('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
|
logging.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
|
||||||
record.get_header(warctools.WarcRecord.TYPE),
|
record.get_header(warctools.WarcRecord.TYPE),
|
||||||
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
|
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
|
||||||
record.get_header(warctools.WarcRecord.URL),
|
record.get_header(warctools.WarcRecord.URL),
|
||||||
@ -779,7 +833,7 @@ class WarcWriterThread(threading.Thread):
|
|||||||
and self.rollover_idle_time is not None
|
and self.rollover_idle_time is not None
|
||||||
and self.rollover_idle_time > 0
|
and self.rollover_idle_time > 0
|
||||||
and time.time() - self._last_activity > self.rollover_idle_time):
|
and time.time() - self._last_activity > self.rollover_idle_time):
|
||||||
logging.info('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
|
logging.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
|
||||||
self._close_writer()
|
self._close_writer()
|
||||||
|
|
||||||
if time.time() - self._last_sync > 60:
|
if time.time() - self._last_sync > 60:
|
||||||
@ -814,7 +868,7 @@ class PlaybackIndexDb:
|
|||||||
|
|
||||||
def save(self, warcfile, recordset, offset):
|
def save(self, warcfile, recordset, offset):
|
||||||
response_record = recordset[0]
|
response_record = recordset[0]
|
||||||
# XXX canonicalize url
|
# XXX canonicalize url?
|
||||||
url = response_record.get_header(warctools.WarcRecord.URL)
|
url = response_record.get_header(warctools.WarcRecord.URL)
|
||||||
date = response_record.get_header(warctools.WarcRecord.DATE)
|
date = response_record.get_header(warctools.WarcRecord.DATE)
|
||||||
|
|
||||||
@ -831,7 +885,7 @@ class PlaybackIndexDb:
|
|||||||
|
|
||||||
self.db[url] = json_value
|
self.db[url] = json_value
|
||||||
|
|
||||||
logging.info('playback index saved: {}:{}'.format(url, json_value))
|
logging.debug('playback index saved: {}:{}'.format(url, json_value))
|
||||||
|
|
||||||
|
|
||||||
def lookup_latest(self, url):
|
def lookup_latest(self, url):
|
||||||
@ -844,12 +898,26 @@ class PlaybackIndexDb:
|
|||||||
latest_date = max(py_value)
|
latest_date = max(py_value)
|
||||||
return latest_date, py_value[latest_date]
|
return latest_date, py_value[latest_date]
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_exact(self, url, warc_date):
    """Return the index entry saved for ``url`` under ``warc_date``.

    The value stored in ``self.db`` for ``url`` is decoded as json and
    looked up by ``warc_date``. Returns None when ``url`` is not in the
    database or when there is no entry for that date.
    """
    if url in self.db:
        captures = json.loads(self.db[url])
        return captures[warc_date] if warc_date in captures else None

    return None
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
description='warcprox - WARC writing MITM HTTP/S proxy',
|
description='warcprox - WARC writing MITM HTTP/S proxy',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('-p', '--port', dest='port', default='8080',
|
arg_parser.add_argument('-p', '--port', dest='port', default='8000',
|
||||||
help='port to listen on')
|
help='port to listen on')
|
||||||
arg_parser.add_argument('-b', '--address', dest='address',
|
arg_parser.add_argument('-b', '--address', dest='address',
|
||||||
default='localhost', help='address to listen on')
|
default='localhost', help='address to listen on')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user