Support multiple captures of the same URL within the same second (both revisit and non-revisit records)

This commit is contained in:
Noah Levitt 2013-11-22 11:19:27 -08:00
parent 28c8dd81f9
commit bdd218d338

View File

@ -501,8 +501,8 @@ class PlaybackProxyHandler(MitmProxyHandler):
return status, sz
def _send_headers_and_refd_payload(self, headers, refers_to_target_uri, refers_to_date):
location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date)
def _send_headers_and_refd_payload(self, headers, refers_to, refers_to_target_uri, refers_to_date):
location = self.server.playback_index_db.lookup_exact(refers_to_target_uri, refers_to_date, record_id=refers_to)
self.logger.debug('loading http payload from {}'.format(location))
fh = self._open_warc_at_offset(location['f'], location['o'])
@ -557,11 +557,12 @@ class PlaybackProxyHandler(MitmProxyHandler):
if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
raise Exception('unknown revisit record profile {}'.format(warc_profile))
refers_to = record.get_header(warctools.WarcRecord.REFERS_TO)
refers_to_target_uri = record.get_header(warctools.WarcRecord.REFERS_TO_TARGET_URI)
refers_to_date = record.get_header(warctools.WarcRecord.REFERS_TO_DATE)
self.logger.debug('revisit record references {} capture of {}'.format(refers_to_date, refers_to_target_uri))
return self._send_headers_and_refd_payload(record.content[1], refers_to_target_uri, refers_to_date)
self.logger.debug('revisit record references {}:{} capture of {}'.format(refers_to_date, refers_to, refers_to_target_uri))
return self._send_headers_and_refd_payload(record.content[1], refers_to, refers_to_target_uri, refers_to_date)
else:
raise Exception('unknown warc record type {}'.format(warc_type))
@ -912,23 +913,32 @@ class PlaybackIndexDb(object):
def close(self):
self.db.close()
def sync(self):
self.db.sync()
def save(self, warcfile, recordset, offset):
response_record = recordset[0]
# XXX canonicalize url?
url = response_record.get_header(warctools.WarcRecord.URL)
date = response_record.get_header(warctools.WarcRecord.DATE)
record_id = response_record.get_header(warctools.WarcRecord.ID)
# url:{date1:{'f':warcfile,'o':response_offset,'q':request_offset,'t':response/revisit,'u':revisit_target_url,'d':revisit_target_date},date2:{...},...}
# there could be two visits of same url in the same second, and WARC-Date is
# prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\
# url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
if url in self.db:
existing_json_value = self.db[url]
py_value = json.loads(existing_json_value)
else:
py_value = {}
py_value[date] = {'f':warcfile, 'o':offset}
if date in py_value:
py_value[date].append({'f':warcfile, 'o':offset, 'i':record_id})
else:
py_value[date] = [{'f':warcfile, 'o':offset, 'i':record_id}]
json_value = json.dumps(py_value, separators=(',',':'))
@ -942,25 +952,32 @@ class PlaybackIndexDb(object):
return None, None
json_value = self.db[url]
self.logger.debug("'{}':{}".format(url, json_value))
py_value = json.loads(json_value)
latest_date = max(py_value)
return latest_date, py_value[latest_date]
return latest_date, py_value[latest_date][0]
def lookup_exact(self, url, warc_date):
def lookup_exact(self, url, warc_date, record_id):
if url not in self.db:
return None
json_value = self.db[url]
self.logger.debug("'{}':{}".format(url, json_value))
py_value = json.loads(json_value)
if warc_date in py_value:
return py_value[warc_date]
for record in py_value[warc_date]:
if record['i'] == record_id:
self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
return record
else:
self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
return None
class WarcproxController(object):
logger = logging.getLogger('warcprox.WarcproxController')
@ -1095,7 +1112,7 @@ def main(argv=sys.argv):
loglevel = logging.INFO
logging.basicConfig(stream=sys.stdout, level=loglevel,
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
try:
hashlib.new(args.digest_algorithm)