mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Custom PUTMETA HTTP verb for writing WARC metadata records; code borrowed from Ilya's fork: https://github.com/ikreymer/warcprox
This commit is contained in:
parent
f79e744823
commit
403404f590
@ -116,6 +116,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
def do_COMMAND(self):
|
def do_COMMAND(self):
|
||||||
if not self.is_connect:
|
if not self.is_connect:
|
||||||
|
if self.command == 'PUTMETA':
|
||||||
|
self._handle_custom_record(type_='metadata')
|
||||||
|
return
|
||||||
|
# if self.command == 'PUTRES':
|
||||||
|
# self._handle_custom_record(type_='resource')
|
||||||
|
# return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Connect to destination
|
# Connect to destination
|
||||||
self._determine_host_port()
|
self._determine_host_port()
|
||||||
@ -130,6 +137,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
self._proxy_request()
|
self._proxy_request()
|
||||||
|
|
||||||
|
def _handle_custom_record(self, type_):
    """Handle a request that asks for a custom record to be written.

    Called by do_COMMAND() for the custom PUTMETA verb with
    ``type_='metadata'``.  This base implementation does not support
    custom records; a subclass has to override this method to provide
    them.

    Raises:
        NotImplementedError: always.  It is a subclass of Exception, so
            callers that catch Exception behave as before.
    """
    raise NotImplementedError('Not supported')
|
||||||
|
|
||||||
def _proxy_request(self):
|
def _proxy_request(self):
|
||||||
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
||||||
|
@ -215,9 +215,33 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
|
|
||||||
return recorded_url
|
return recorded_url
|
||||||
|
|
||||||
|
def _handle_custom_record(self, type_):
    """Queue the body of the current request as a custom record.

    The request path becomes the record url and the request body becomes
    the record payload.  The request must carry a ``Content-Type`` header
    and a ``Content-Length`` header holding a non-negative integer;
    otherwise a 400 response is sent and nothing is queued.  On success
    a RecordedUrl is put on ``self.server.recorded_url_q`` and an empty
    204 response is sent.

    Args:
        type_: kind of custom record, passed through to RecordedUrl as
            ``custom_type`` (do_COMMAND() passes ``'metadata'``).
    """
    self.url = self.path

    content_type = self.headers.get('Content-Type')
    try:
        content_length = int(self.headers.get('Content-Length'))
    except (TypeError, ValueError):
        # Header missing (None) or not a number: treat as a bad request
        # instead of letting the exception escape the handler.
        content_length = -1

    if content_type is None or content_length < 0:
        # A negative length would make rfile.read() read until EOF.
        # send_error() emits the complete reply (status line, headers and
        # body) on its own, so end_headers() must not be called after it.
        self.send_error(400, 'Bad request')
        return

    request_data = self.rfile.read(content_length)
    warcprox_meta = self.headers.get('Warcprox-Meta')

    # No response recorder and no remote ip are supplied for a custom
    # record; the payload is the request body itself.
    rec_custom = RecordedUrl(url=self.url,
                             request_data=request_data,
                             response_recorder=None,
                             remote_ip=b'',
                             warcprox_meta=warcprox_meta,
                             content_type=content_type.encode('latin1'),
                             custom_type=type_)

    self.server.recorded_url_q.put(rec_custom)

    # NOTE(review): the canonical reason phrase for 204 is 'No Content';
    # 'OK' is kept so the bytes on the wire do not change.
    self.send_response(204, 'OK')
    self.end_headers()
|
||||||
|
|
||||||
|
|
||||||
class RecordedUrl(object):
|
class RecordedUrl(object):
|
||||||
def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
|
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||||
|
warcprox_meta=None, content_type=None, custom_type=None):
|
||||||
# XXX should test what happens with non-ascii url (when does
|
# XXX should test what happens with non-ascii url (when does
|
||||||
# url-encoding happen?)
|
# url-encoding happen?)
|
||||||
if type(url) is not bytes:
|
if type(url) is not bytes:
|
||||||
@ -238,6 +262,9 @@ class RecordedUrl(object):
|
|||||||
else:
|
else:
|
||||||
self.warcprox_meta = {}
|
self.warcprox_meta = {}
|
||||||
|
|
||||||
|
self.content_type = content_type
|
||||||
|
self.custom_type = custom_type
|
||||||
|
|
||||||
|
|
||||||
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
||||||
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
|
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
|
||||||
|
@ -56,6 +56,16 @@ class WarcWriter:
|
|||||||
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
|
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
|
||||||
|
|
||||||
dedup_info = None
|
dedup_info = None
|
||||||
|
|
||||||
|
# metadata special case
|
||||||
|
if recorded_url.custom_type == 'metadata':
|
||||||
|
metadata_rec = self.build_warc_record(url=recorded_url.url,
|
||||||
|
warc_date=warc_date,
|
||||||
|
data=recorded_url.request_data,
|
||||||
|
warc_type=warctools.WarcRecord.METADATA,
|
||||||
|
content_type=recorded_url.content_type)
|
||||||
|
return [metadata_rec]
|
||||||
|
|
||||||
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
|
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
|
||||||
key = self.digest_str(recorded_url.response_recorder.payload_digest)
|
key = self.digest_str(recorded_url.response_recorder.payload_digest)
|
||||||
dedup_info = self.dedup_db.lookup(key)
|
dedup_info = self.dedup_db.lookup(key)
|
||||||
@ -230,7 +240,8 @@ class WarcWriter:
|
|||||||
if self.playback_index_db is not None:
|
if self.playback_index_db is not None:
|
||||||
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
|
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
|
||||||
|
|
||||||
recorded_url.response_recorder.tempfile.close()
|
if recorded_url.response_recorder is not None:
|
||||||
|
recorded_url.response_recorder.tempfile.close()
|
||||||
|
|
||||||
def write_records(self, recorded_url):
|
def write_records(self, recorded_url):
|
||||||
recordset = self.build_warc_records(recorded_url)
|
recordset = self.build_warc_records(recorded_url)
|
||||||
@ -252,7 +263,6 @@ class WarcWriter:
|
|||||||
self._final_tasks(recorded_url, recordset, recordset_offset)
|
self._final_tasks(recorded_url, recordset, recordset_offset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class WarcWriterThread(threading.Thread):
|
class WarcWriterThread(threading.Thread):
|
||||||
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
|
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user