1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: cleanup the revisit resolving logic in replay

also, update documented logic on wiki at:
https://github.com/ikreymer/pywb/wiki/PyWb-Record-Lookup-and-Revisits
This commit is contained in:
Ilya Kreymer 2014-01-20 17:52:14 -08:00
parent 9a28a2ec6e
commit 8fd10673e8

View File

@ -103,39 +103,38 @@ class ReplayHandler(object):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-')
# Case 1: non-revisit
if (hasCurr and not hasOrig):
headersRecord = self._load(cdx, False, failedFiles)
# load headers record from cdx['filename'] unless it is '-' (rare)
headersRecord = self._load(cdx, False, failedFiles) if hasCurr else None
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
# single lookup cases
# case 2: non-revisit
elif (hasCurr and not hasOrig):
payloadRecord = headersRecord
isRevisit = False
# Case 2: old-style revisit, load headers from original payload
elif (not hasCurr and hasOrig):
payloadRecord = self._load(cdx, False, failedFiles)
headersRecord = payloadRecord
isRevisit = True
# Case 3: modern revisit, load headers from curr, payload from original
elif (hasCurr and hasOrig):
headersRecord = self._load(cdx, False, failedFiles)
# case 3: identical url revisit, load payload from orig.filename
elif (hasOrig):
payloadRecord = self._load(cdx, True, failedFiles)
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
# special case: set header to payload if old-style revisit with missing header
if not headersRecord:
headersRecord = payloadRecord
elif headersRecord != payloadRecord:
# close remainder of stream as this record only used for (already parsed) headers
headersRecord.stream.close()
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headersRecord.status_headers.headers:
headersRecord.stream.close()
headersRecord = payloadRecord
else:
headersRecord.stream.close()
isRevisit = True
else:
if not headersRecord or not payloadRecord:
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
# special cases: if mimetype is still warc/revisit.. need to look further
if cdx['mimetype'] == 'warc/revisit':
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
@ -147,11 +146,10 @@ class ReplayHandler(object):
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
# Case 5: unresolved revisit error, if refers to target uri not present or same as the current uri
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
# Case 6: url-agnostic revisit with different original url (either same or different date)
ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
if not ref_target_date: