mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Custom PUTMETA HTTP verb for writing WARC metadata records; code borrowed from Ilya's fork: https://github.com/ikreymer/warcprox
This commit is contained in:
parent
f79e744823
commit
403404f590
@ -116,6 +116,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
|
||||
def do_COMMAND(self):
|
||||
if not self.is_connect:
|
||||
if self.command == 'PUTMETA':
|
||||
self._handle_custom_record(type_='metadata')
|
||||
return
|
||||
# if self.command == 'PUTRES':
|
||||
# self._handle_custom_record(type_='resource')
|
||||
# return
|
||||
|
||||
try:
|
||||
# Connect to destination
|
||||
self._determine_host_port()
|
||||
@ -130,6 +137,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
|
||||
self._proxy_request()
|
||||
|
||||
def _handle_custom_record(self, type_):
    """Hook for custom-verb requests (do_COMMAND calls it for PUTMETA).

    The base handler has no support for custom records, so it always
    raises; a subclass that wants to accept them overrides this method.

    type_ -- kind of custom record requested (e.g. 'metadata').

    Raises NotImplementedError, which is still an Exception subclass, so
    callers that catch Exception behave as before.
    """
    raise NotImplementedError('Not supported')
|
||||
|
||||
def _proxy_request(self):
    """Proxy the current request; abstract in this base handler.

    Always raises here. NotImplementedError is used to signal the missing
    override; it is an Exception subclass, so existing handlers that catch
    Exception are unaffected.
    """
    raise NotImplementedError('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
|
||||
|
@ -215,9 +215,33 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
|
||||
return recorded_url
|
||||
|
||||
def _handle_custom_record(self, type_):
    """Queue the body of a custom-verb request as a custom record.

    The request must carry both a Content-Length and a Content-Type
    header, and Content-Length must be a non-negative integer; otherwise
    the client gets a 400 and nothing is queued. On success the body is
    wrapped in a RecordedUrl tagged with custom_type=type_, put on
    self.server.recorded_url_q, and the client gets a 204.

    type_ -- value stored as the record's custom_type (do_COMMAND passes
             'metadata' for PUTMETA).
    """
    self.url = self.path

    if 'Content-Length' not in self.headers or 'Content-Type' not in self.headers:
        # send_error() emits a complete reply (status line, headers and
        # body), so end_headers() must not be called after it.
        self.send_error(400, 'Bad request')
        return

    try:
        content_length = int(self.headers['Content-Length'])
    except ValueError:
        content_length = -1
    if content_length < 0:
        # A negative length would make rfile.read() read until EOF.
        self.send_error(400, 'Bad request')
        return

    request_data = self.rfile.read(content_length)
    warcprox_meta = self.headers.get('Warcprox-Meta')

    rec_custom = RecordedUrl(url=self.url,
                             request_data=request_data,
                             response_recorder=None,
                             remote_ip=b'',
                             warcprox_meta=warcprox_meta,
                             content_type=self.headers['Content-Type'].encode('latin1'),
                             custom_type=type_)

    self.server.recorded_url_q.put(rec_custom)
    self.send_response(204, 'OK')
    self.end_headers()
|
||||
|
||||
|
||||
class RecordedUrl(object):
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||
warcprox_meta=None, content_type=None, custom_type=None):
|
||||
# XXX should test what happens with non-ascii url (when does
|
||||
# url-encoding happen?)
|
||||
if type(url) is not bytes:
|
||||
@ -238,6 +262,9 @@ class RecordedUrl(object):
|
||||
else:
|
||||
self.warcprox_meta = {}
|
||||
|
||||
self.content_type = content_type
|
||||
self.custom_type = custom_type
|
||||
|
||||
|
||||
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
||||
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
|
||||
|
@ -56,6 +56,16 @@ class WarcWriter:
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
|
||||
|
||||
dedup_info = None
|
||||
|
||||
# metadata special case
|
||||
if recorded_url.custom_type == 'metadata':
|
||||
metadata_rec = self.build_warc_record(url=recorded_url.url,
|
||||
warc_date=warc_date,
|
||||
data=recorded_url.request_data,
|
||||
warc_type=warctools.WarcRecord.METADATA,
|
||||
content_type=recorded_url.content_type)
|
||||
return [metadata_rec]
|
||||
|
||||
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
|
||||
key = self.digest_str(recorded_url.response_recorder.payload_digest)
|
||||
dedup_info = self.dedup_db.lookup(key)
|
||||
@ -230,7 +240,8 @@ class WarcWriter:
|
||||
if self.playback_index_db is not None:
|
||||
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
|
||||
|
||||
recorded_url.response_recorder.tempfile.close()
|
||||
if recorded_url.response_recorder is not None:
|
||||
recorded_url.response_recorder.tempfile.close()
|
||||
|
||||
def write_records(self, recorded_url):
|
||||
recordset = self.build_warc_records(recorded_url)
|
||||
@ -252,7 +263,6 @@ class WarcWriter:
|
||||
self._final_tasks(recorded_url, recordset, recordset_offset)
|
||||
|
||||
|
||||
|
||||
class WarcWriterThread(threading.Thread):
|
||||
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user