Add a custom PUTMETA HTTP verb for writing WARC metadata records; code borrowed from Ilya's fork: https://github.com/ikreymer/warcprox

This commit is contained in:
Noah Levitt 2015-07-14 15:58:15 -07:00
parent f79e744823
commit 403404f590
3 changed files with 49 additions and 3 deletions

View File

@ -116,6 +116,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
def do_COMMAND(self):
if not self.is_connect:
if self.command == 'PUTMETA':
self._handle_custom_record(type_='metadata')
return
# if self.command == 'PUTRES':
# self._handle_custom_record(type_='resource')
# return
try:
# Connect to destination
self._determine_host_port()
@ -130,6 +137,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._proxy_request()
def _handle_custom_record(self, type_):
raise Exception('Not supported')
def _proxy_request(self):
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')

View File

@ -215,9 +215,33 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
return recorded_url
def _handle_custom_record(self, type_):
self.url = self.path
if 'Content-Length' in self.headers and 'Content-Type' in self.headers:
request_data = self.rfile.read(int(self.headers['Content-Length']))
warcprox_meta = self.headers.get('Warcprox-Meta')
rec_custom = RecordedUrl(url=self.url,
request_data=request_data,
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'].encode('latin1'),
custom_type=type_)
self.server.recorded_url_q.put(rec_custom)
self.send_response(204, 'OK')
else:
self.send_error(400, 'Bad request')
self.end_headers()
class RecordedUrl(object):
def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
def __init__(self, url, request_data, response_recorder, remote_ip,
warcprox_meta=None, content_type=None, custom_type=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@ -238,6 +262,9 @@ class RecordedUrl(object):
else:
self.warcprox_meta = {}
self.content_type = content_type
self.custom_type = custom_type
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger("warcprox.warcprox.WarcProxy")

View File

@ -56,6 +56,16 @@ class WarcWriter:
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
dedup_info = None
# metadata special case
if recorded_url.custom_type == 'metadata':
metadata_rec = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date,
data=recorded_url.request_data,
warc_type=warctools.WarcRecord.METADATA,
content_type=recorded_url.content_type)
return [metadata_rec]
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
key = self.digest_str(recorded_url.response_recorder.payload_digest)
dedup_info = self.dedup_db.lookup(key)
@ -230,7 +240,8 @@ class WarcWriter:
if self.playback_index_db is not None:
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
recorded_url.response_recorder.tempfile.close()
if recorded_url.response_recorder is not None:
recorded_url.response_recorder.tempfile.close()
def write_records(self, recorded_url):
recordset = self.build_warc_records(recorded_url)
@ -252,7 +263,6 @@ class WarcWriter:
self._final_tasks(recorded_url, recordset, recordset_offset)
class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")