mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
write WARC-IP-Address header on response record
This commit is contained in:
parent
980ba13d10
commit
7367620dae
21
warcprox.py
21
warcprox.py
@ -310,11 +310,13 @@ class WarcProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
|
||||
while buf != '':
|
||||
buf = h.read(8192)
|
||||
|
||||
remote_ip = self._proxy_sock.getpeername()[0]
|
||||
|
||||
# Let's close off the remote end
|
||||
h.close()
|
||||
self._proxy_sock.close()
|
||||
|
||||
self.server.recordset_q.create_and_queue(self.url, req, h.recorder)
|
||||
self.server.recordset_q.create_and_queue(self.url, req, h.recorder, remote_ip)
|
||||
|
||||
|
||||
def __getattr__(self, item):
|
||||
@ -352,13 +354,14 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
|
||||
# consecutively in the same warc.
|
||||
class WarcRecordsetQueue(Queue.Queue):
|
||||
|
||||
def create_and_queue(self, url, request_data, response_recorder):
|
||||
def create_and_queue(self, url, request_data, response_recorder, remote_ip):
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||
|
||||
response_record, response_record_id = self.make_record(url=url,
|
||||
warc_date=warc_date, recorder=response_recorder,
|
||||
warc_type=warctools.WarcRecord.RESPONSE,
|
||||
content_type="application/http;msgtype=response")
|
||||
warc_type=warctools.WarcRecord.RESPONSE,
|
||||
content_type="application/http;msgtype=response",
|
||||
remote_ip=remote_ip)
|
||||
|
||||
request_record, request_record_id = self.make_record(url=url,
|
||||
warc_date=warc_date, data=request_data,
|
||||
@ -372,7 +375,7 @@ class WarcRecordsetQueue(Queue.Queue):
|
||||
|
||||
@staticmethod
|
||||
def make_record(url, warc_date=None, recorder=None, data=None,
|
||||
concurrent_to=None, warc_type=None, content_type=None):
|
||||
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None):
|
||||
|
||||
if warc_date is None:
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.now())
|
||||
@ -380,12 +383,13 @@ class WarcRecordsetQueue(Queue.Queue):
|
||||
record_id = warctools.WarcRecord.random_warc_uuid()
|
||||
|
||||
headers = []
|
||||
if warc_type is not None:
|
||||
headers.append((warctools.WarcRecord.TYPE, warc_type))
|
||||
headers.append((warctools.WarcRecord.ID, record_id))
|
||||
headers.append((warctools.WarcRecord.DATE, warc_date))
|
||||
headers.append((warctools.WarcRecord.URL, url))
|
||||
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
||||
if warc_type is not None:
|
||||
headers.append((warctools.WarcRecord.TYPE, warc_type))
|
||||
if remote_ip is not None:
|
||||
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
|
||||
if concurrent_to is not None:
|
||||
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
|
||||
if content_type is not None:
|
||||
@ -457,7 +461,6 @@ class WarcWriterThread(threading.Thread):
|
||||
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
|
||||
headers.append((warctools.WarcRecord.FILENAME, filename))
|
||||
headers.append((warctools.WarcRecord.DATE, warc_record_date))
|
||||
# headers.append((warctools.WarcRecord.IP_ADDRESS, ip))
|
||||
|
||||
warcinfo_fields = []
|
||||
warcinfo_fields.append('software: warcprox.py https://github.com/nlevitt/warcprox')
|
||||
|
Loading…
x
Reference in New Issue
Block a user