mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
support multiple captures of same url in the same second (revisits and non-revisits)
This commit is contained in:
parent
28c8dd81f9
commit
bdd218d338
@ -501,8 +501,8 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
return status, sz
|
return status, sz
|
||||||
|
|
||||||
|
|
||||||
def _send_headers_and_refd_payload(self, headers, refers_to_target_uri, refers_to_date):
|
def _send_headers_and_refd_payload(self, headers, refers_to, refers_to_target_uri, refers_to_date):
|
||||||
location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date)
|
location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date, record_id=refers_to)
|
||||||
self.logger.debug('loading http payload from {}'.format(location))
|
self.logger.debug('loading http payload from {}'.format(location))
|
||||||
|
|
||||||
fh = self._open_warc_at_offset(location['f'], location['o'])
|
fh = self._open_warc_at_offset(location['f'], location['o'])
|
||||||
@ -557,11 +557,12 @@ class PlaybackProxyHandler(MitmProxyHandler):
|
|||||||
if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
|
if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
|
||||||
raise Exception('unknown revisit record profile {}'.format(warc_profile))
|
raise Exception('unknown revisit record profile {}'.format(warc_profile))
|
||||||
|
|
||||||
|
refers_to = record.get_header(warctools.WarcRecord.REFERS_TO)
|
||||||
refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
|
refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
|
||||||
refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)
|
refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)
|
||||||
|
|
||||||
self.logger.debug('revisit record references {} capture of {}'.format(refers_to_date, refers_to_target_uri))
|
self.logger.debug('revisit record references {}:{} capture of {}'.format(refers_to_date, refers_to, refers_to_target_uri))
|
||||||
return self._send_headers_and_refd_payload(record.content[1], refers_to_target_uri, refers_to_date)
|
return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise Exception('unknown warc record type {}'.format(warc_type))
|
raise Exception('unknown warc record type {}'.format(warc_type))
|
||||||
@ -912,23 +913,32 @@ class PlaybackIndexDb(object):
|
|||||||
def close(self):
|
def close(self):
|
||||||
self.db.close()
|
self.db.close()
|
||||||
|
|
||||||
|
|
||||||
def sync(self):
|
def sync(self):
|
||||||
self.db.sync()
|
self.db.sync()
|
||||||
|
|
||||||
|
|
||||||
def save(self, warcfile, recordset, offset):
|
def save(self, warcfile, recordset, offset):
|
||||||
response_record = recordset[0]
|
response_record = recordset[0]
|
||||||
# XXX canonicalize url?
|
# XXX canonicalize url?
|
||||||
url = response_record.get_header(warctools.WarcRecord.URL)
|
url = response_record.get_header(warctools.WarcRecord.URL)
|
||||||
date = response_record.get_header(warctools.WarcRecord.DATE)
|
date = response_record.get_header(warctools.WarcRecord.DATE)
|
||||||
|
record_id = response_record.get_header(warctools.WarcRecord.ID)
|
||||||
|
|
||||||
# url:{date1:{'f':warcfile,'o':response_offset,'q':request_offset,'t':response/revisit,'u':revisit_target_url,'d':revisit_target_date},date2:{...},...}
|
# there could be two visits of same url in the same second, and WARC-Date is
|
||||||
|
# prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\
|
||||||
|
|
||||||
|
# url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
|
||||||
if url in self.db:
|
if url in self.db:
|
||||||
existing_json_value = self.db[url]
|
existing_json_value = self.db[url]
|
||||||
py_value = json.loads(existing_json_value)
|
py_value = json.loads(existing_json_value)
|
||||||
else:
|
else:
|
||||||
py_value = {}
|
py_value = {}
|
||||||
|
|
||||||
py_value[date] = {'f':warcfile, 'o':offset}
|
if date in py_value:
|
||||||
|
py_value[date].append({'f':warcfile, 'o':offset, 'i':record_id})
|
||||||
|
else:
|
||||||
|
py_value[date] = [{'f':warcfile, 'o':offset, 'i':record_id}]
|
||||||
|
|
||||||
json_value = json.dumps(py_value, separators=(',',':'))
|
json_value = json.dumps(py_value, separators=(',',':'))
|
||||||
|
|
||||||
@ -942,25 +952,32 @@ class PlaybackIndexDb(object):
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
json_value = self.db[url]
|
json_value = self.db[url]
|
||||||
|
self.logger.debug("'{}':{}".format(url, json_value))
|
||||||
py_value = json.loads(json_value)
|
py_value = json.loads(json_value)
|
||||||
|
|
||||||
latest_date = max(py_value)
|
latest_date = max(py_value)
|
||||||
return latest_date, py_value[latest_date]
|
return latest_date, py_value[latest_date][0]
|
||||||
|
|
||||||
|
|
||||||
def lookup_exact(self, url, warc_date):
|
def lookup_exact(self, url, warc_date, record_id):
|
||||||
if url not in self.db:
|
if url not in self.db:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
json_value = self.db[url]
|
json_value = self.db[url]
|
||||||
|
self.logger.debug("'{}':{}".format(url, json_value))
|
||||||
py_value = json.loads(json_value)
|
py_value = json.loads(json_value)
|
||||||
|
|
||||||
if warc_date in py_value:
|
if warc_date in py_value:
|
||||||
return py_value[warc_date]
|
for record in py_value[warc_date]:
|
||||||
|
if record['i'] == record_id:
|
||||||
|
self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
|
||||||
|
return record
|
||||||
else:
|
else:
|
||||||
|
self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class WarcproxController(object):
|
class WarcproxController(object):
|
||||||
logger = logging.getLogger('warcprox.WarcproxController')
|
logger = logging.getLogger('warcprox.WarcproxController')
|
||||||
|
|
||||||
@ -1095,7 +1112,7 @@ def main(argv=sys.argv):
|
|||||||
loglevel = logging.INFO
|
loglevel = logging.INFO
|
||||||
|
|
||||||
logging.basicConfig(stream=sys.stdout, level=loglevel,
|
logging.basicConfig(stream=sys.stdout, level=loglevel,
|
||||||
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hashlib.new(args.digest_algorithm)
|
hashlib.new(args.digest_algorithm)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user