From 403404f59093dc447ec65bc959b384c14d2e23e5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 Jul 2015 15:58:15 -0700 Subject: [PATCH] custom PUTMETA http verb for writing warc metadata records; code borrowed from Ilya's fork https://github.com/ikreymer/warcprox --- warcprox/mitmproxy.py | 9 +++++++++ warcprox/warcprox.py | 29 ++++++++++++++++++++++++++++- warcprox/warcwriter.py | 14 ++++++++++++-- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 0ed3211..cbd3992 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -116,6 +116,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def do_COMMAND(self): if not self.is_connect: + if self.command == 'PUTMETA': + self._handle_custom_record(type_='metadata') + return + # if self.command == 'PUTRES': + # self._handle_custom_record(type_='resource') + # return + try: # Connect to destination self._determine_host_port() @@ -130,6 +137,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._proxy_request() + def _handle_custom_record(self, type_): + raise Exception('Not supported') def _proxy_request(self): raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!') diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py index 7d98293..10e5b12 100644 --- a/warcprox/warcprox.py +++ b/warcprox/warcprox.py @@ -215,9 +215,33 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): return recorded_url + def _handle_custom_record(self, type_): + self.url = self.path + + if 'Content-Length' in self.headers and 'Content-Type' in self.headers: + request_data = self.rfile.read(int(self.headers['Content-Length'])) + + warcprox_meta = self.headers.get('Warcprox-Meta') + + rec_custom = RecordedUrl(url=self.url, + request_data=request_data, + response_recorder=None, + remote_ip=b'', + warcprox_meta=warcprox_meta, + content_type=self.headers['Content-Type'].encode('latin1'), + custom_type=type_) + + self.server.recorded_url_q.put(rec_custom) + self.send_response(204, 'OK') + else: + self.send_error(400, 'Bad request') + + self.end_headers() + class RecordedUrl(object): - def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None): + def __init__(self, url, request_data, response_recorder, remote_ip, + warcprox_meta=None, content_type=None, custom_type=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -238,6 +262,9 @@ class RecordedUrl(object): else: self.warcprox_meta = {} + self.content_type = content_type + self.custom_type = custom_type + class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): logger = logging.getLogger("warcprox.warcprox.WarcProxy") diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py index 6af6733..d92f98a 100644 --- a/warcprox/warcwriter.py +++ b/warcprox/warcwriter.py @@ -56,6 +56,16 @@ class WarcWriter: warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) dedup_info = None + + # metadata special case + if recorded_url.custom_type == 'metadata': + metadata_rec = self.build_warc_record(url=recorded_url.url, + warc_date=warc_date, + data=recorded_url.request_data, + warc_type=warctools.WarcRecord.METADATA, + content_type=recorded_url.content_type) + return [metadata_rec] + if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None: key = self.digest_str(recorded_url.response_recorder.payload_digest) dedup_info = self.dedup_db.lookup(key) @@ -230,7 +240,8 @@ class WarcWriter: if self.playback_index_db is not None: self.playback_index_db.save(self._f_finalname, recordset, recordset_offset) - recorded_url.response_recorder.tempfile.close() + if recorded_url.response_recorder is not None: + recorded_url.response_recorder.tempfile.close() def write_records(self, recorded_url): recordset = self.build_warc_records(recorded_url) @@ -252,7 +263,6 @@ class WarcWriter: self._final_tasks(recorded_url, recordset, recordset_offset) - class WarcWriterThread(threading.Thread): logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")