mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-28 00:25:21 +01:00
refactor: clean up the revisit-resolving logic in replay
Also update the documented logic on the wiki at: https://github.com/ikreymer/pywb/wiki/PyWb-Record-Lookup-and-Revisits
This commit is contained in:
parent
9a28a2ec6e
commit
8fd10673e8
@ -103,39 +103,38 @@ class ReplayHandler(object):
|
|||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
hasOrig = (cdx.get('orig.filename','-') != '-')
|
hasOrig = (cdx.get('orig.filename','-') != '-')
|
||||||
|
|
||||||
# Case 1: non-revisit
|
# load headers record from cdx['filename'] unless it is '-' (rare)
|
||||||
if (hasCurr and not hasOrig):
|
headersRecord = self._load(cdx, False, failedFiles) if hasCurr else None
|
||||||
headersRecord = self._load(cdx, False, failedFiles)
|
|
||||||
|
# two index lookups
|
||||||
|
# Case 1: if mimetype is still warc/revisit
|
||||||
|
if cdx['mimetype'] == 'warc/revisit' and headersRecord:
|
||||||
|
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
|
||||||
|
|
||||||
|
# single lookup cases
|
||||||
|
# case 2: non-revisit
|
||||||
|
elif (hasCurr and not hasOrig):
|
||||||
payloadRecord = headersRecord
|
payloadRecord = headersRecord
|
||||||
isRevisit = False
|
|
||||||
|
|
||||||
# Case 2: old-style revisit, load headers from original payload
|
# case 3: identical url revisit, load payload from orig.filename
|
||||||
elif (not hasCurr and hasOrig):
|
elif (hasOrig):
|
||||||
payloadRecord = self._load(cdx, False, failedFiles)
|
|
||||||
headersRecord = payloadRecord
|
|
||||||
isRevisit = True
|
|
||||||
|
|
||||||
# Case 3: modern revisit, load headers from curr, payload from original
|
|
||||||
elif (hasCurr and hasOrig):
|
|
||||||
headersRecord = self._load(cdx, False, failedFiles)
|
|
||||||
payloadRecord = self._load(cdx, True, failedFiles)
|
payloadRecord = self._load(cdx, True, failedFiles)
|
||||||
|
|
||||||
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
|
# special case: set header to payload if old-style revisit with missing header
|
||||||
if not headersRecord.status_headers.headers:
|
if not headersRecord:
|
||||||
headersRecord.stream.close()
|
|
||||||
headersRecord = payloadRecord
|
headersRecord = payloadRecord
|
||||||
else:
|
elif headersRecord != payloadRecord:
|
||||||
|
# close remainder of stream as this record only used for (already parsed) headers
|
||||||
headersRecord.stream.close()
|
headersRecord.stream.close()
|
||||||
|
|
||||||
|
# special case: if the headers record is actually empty (e.g. an empty revisit), use the headers from the payload record instead
|
||||||
|
if not headersRecord.status_headers.headers:
|
||||||
|
headersRecord = payloadRecord
|
||||||
|
|
||||||
isRevisit = True
|
|
||||||
|
|
||||||
else:
|
if not headersRecord or not payloadRecord:
|
||||||
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
|
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
|
||||||
|
|
||||||
# special cases: if mimetype is still warc/revisit.. need to look further
|
|
||||||
if cdx['mimetype'] == 'warc/revisit':
|
|
||||||
payloadRecord = self._load_different_url_payload(wbrequest, query, cdx, headersRecord, failedFiles)
|
|
||||||
|
|
||||||
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
|
return WbResponse.stream_response(headersRecord.status_headers, payloadRecord.stream)
|
||||||
|
|
||||||
@ -147,11 +146,10 @@ class ReplayHandler(object):
|
|||||||
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
|
def _load_different_url_payload(self, wbrequest, query, cdx, headersRecord, failedFiles):
|
||||||
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
|
ref_target_uri = headersRecord.rec_headers.getHeader('WARC-Refers-To-Target-URI')
|
||||||
|
|
||||||
# Case 5: unresolved revisit error, if refers to target uri not present or same as the current uri
|
# Check for an unresolved revisit error: the WARC-Refers-To-Target-URI is missing or is the same as the current URL
|
||||||
if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
|
if not ref_target_uri or (ref_target_uri == headersRecord.rec_headers.getHeader('WARC-Target-URI')):
|
||||||
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
|
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
|
||||||
|
|
||||||
# Case 6: url-agnostic revisit with different original url (either same or different date)
|
|
||||||
ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
|
ref_target_date = headersRecord.rec_headers.getHeader('WARC-Refers-To-Date')
|
||||||
|
|
||||||
if not ref_target_date:
|
if not ref_target_date:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user