1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/warc/resolvingloader.py
Ilya Kreymer 0784e4e5aa spin-off warcio!
update imports to point to warcio
warcio rename fixes:
- ArcWarcRecord.stream -> raw_stream
- ArcWarcRecord.status_headers -> http_headers
- ArchiveLoadFailed single param init
2017-03-07 10:58:00 -08:00

220 lines
8.0 KiB
Python

from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.utils.wbexception import NotFoundException
import six
#=================================================================
class ResolvingLoader(object):
    """
    Load the http headers and the payload stream for a capture that is
    described by a cdx entry.

    Filenames taken from the cdx are passed to the configured path
    resolvers, and every candidate path they produce is handed to the
    record loader until one loads successfully. For revisit captures the
    headers and the payload may come from two different records.
    """

    MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'

    def __init__(self, path_resolvers, record_loader=None,
                 no_record_parse=False):
        """
        :param path_resolvers: re-iterable collection (e.g. a list) of
            callables invoked as ``resolver(filename, cdx)``. Each one
            returns a single path string, an iterable of paths, or a
            falsy value when it has no candidates.
        :param record_loader: object providing
            ``load(path, offset, length, no_record_parse=...)``.
            When omitted, a new BlockArcWarcRecordLoader is created for
            this instance.
        :param no_record_parse: forwarded unchanged to
            ``record_loader.load``.
        """
        # Build the default lazily: a call in the signature would run
        # once at definition time and be shared by every instance.
        if record_loader is None:
            record_loader = BlockArcWarcRecordLoader()

        self.path_resolvers = path_resolvers
        self.record_loader = record_loader
        self.no_record_parse = no_record_parse

    def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
        """
        Load the capture described by ``cdx``.

        :param cdx: dict-like cdx entry
        :param failed_files: list of filenames that already failed during
            this request (updated in place), or None to disable tracking
        :param cdx_loader: callable used to look up the original capture
            of a revisit at a different url, or a falsy value
        :return: tuple ``(http_headers, raw_stream)`` where the headers
            come from the headers record and the stream from the payload
            record
        :raises ArchiveLoadFailed: if no usable record could be loaded
        """
        headers_record, payload_record = self.load_headers_and_payload(
            cdx, failed_files, cdx_loader)

        # Default handling logic when loading http status/headers

        # special case: set header to payload if old-style revisit
        # with missing header
        if not headers_record:
            headers_record = payload_record
        elif headers_record != payload_record:
            # close remainder of stream as this record only used for
            # (already parsed) headers
            headers_record.raw_stream.close()

            # special case: check if headers record is actually empty
            # (eg empty revisit), then use headers from revisit
            if not headers_record.http_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        # ensure status line is valid from here
        headers_record.http_headers.validate_statusline('204 No Content')

        return (headers_record.http_headers, payload_record.raw_stream)

    def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
        """
        Resolve headers and payload for a given capture.

        In the simple case, headers and payload are in the same record.
        In the case of revisit records, the payload and headers may be in
        different records.

        If the original has already been found, lookup original using
        orig.* fields in cdx dict.
        Otherwise, call _load_different_url_payload() to get cdx index
        from a different url to find the original record.

        :return: tuple ``(headers_record, payload_record)``; either may be
            None when the cdx names no file to load it from
        :raises ArchiveLoadFailed: if a named file cannot be loaded
        """
        has_curr = (cdx['filename'] != '-')

        orig_f = cdx.get('orig.filename')
        has_orig = orig_f and orig_f != '-'

        # Both start as None so that a cdx with neither a current nor an
        # original filename returns (None, None) instead of hitting an
        # unbound local at the return statement.
        headers_record = None
        payload_record = None

        # load headers record from cdx['filename'] unless it is '-' (rare)
        if has_curr:
            headers_record = self._resolve_path_load(cdx, False, failed_files)

        # two index lookups
        # case 1: if mimetype is still warc/revisit
        if cdx.get('mime') == 'warc/revisit' and headers_record:
            payload_record = self._load_different_url_payload(cdx,
                                                              headers_record,
                                                              failed_files,
                                                              cdx_loader)

        # single lookup cases
        # case 2: non-revisit
        elif has_curr and not has_orig:
            payload_record = headers_record

        # case 3: identical url revisit, load payload from orig.filename
        elif has_orig:
            payload_record = self._resolve_path_load(cdx, True, failed_files)

        return headers_record, payload_record

    def _resolve_path_load(self, cdx, is_original, failed_files):
        """
        Load specific record based on filename, offset and length
        fields in the cdx.

        If is_original is true, use the orig.* fields of the cdx.

        Resolve the filename to candidate paths using the configured path
        resolvers and return the first record that loads.

        If a failed_files list is provided, skip filenames already in it
        and append the filename when every candidate fails.

        :raises ArchiveLoadFailed: if the filename already failed, no
            resolver produced a path, or every candidate path failed to
            load (the message carries the last underlying error)
        """
        import sys

        if is_original:
            filename = cdx['orig.filename']
            offset = cdx['orig.offset']
            # tolerate a missing length, same as the non-original branch
            length = cdx.get('orig.length', '-')
        else:
            filename = cdx['filename']
            offset = cdx['offset']
            length = cdx.get('length', '-')

        # optimization: if same file already failed this request,
        # don't try again
        if failed_files is not None and filename in failed_files:
            raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)

        last_exc = None
        last_traceback = None

        for resolver in self.path_resolvers:
            possible_paths = resolver(filename, cdx)

            if not possible_paths:
                continue

            # a resolver may return one path as a plain string
            if isinstance(possible_paths, six.string_types):
                possible_paths = [possible_paths]

            for path in possible_paths:
                try:
                    return self.record_loader.load(
                        path, offset, length,
                        no_record_parse=self.no_record_parse)
                except Exception as ue:
                    # deliberately broad: remember the failure and move
                    # on to the next candidate path
                    last_exc = ue
                    last_traceback = sys.exc_info()[2]

        # Unsuccessful if reached here
        if failed_files is not None:
            failed_files.append(filename)

        if last_exc:
            msg = str(last_exc)
        else:
            msg = 'Archive File Not Found'

        # re-raise as ArchiveLoadFailed, keeping the traceback of the last
        # underlying failure (None when no path was ever tried)
        six.reraise(ArchiveLoadFailed,
                    ArchiveLoadFailed(filename + ': ' + msg),
                    last_traceback)

    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.

        The url and date of the original are read from the
        WARC-Refers-To-Target-URI and WARC-Refers-To-Date headers of the
        revisit record (falling back to cdx['timestamp'] when the date
        header is absent). cdx_loader is then queried for matching
        captures, and the first one that loads is returned.

        :raises ArchiveLoadFailed: if the revisit record has no
            WARC-Refers-To-Target-URI, the lookup finds nothing, or none
            of the matching captures can be loaded
        """
        rec_headers = headers_record.rec_headers

        ref_target_uri = rec_headers.get_header('WARC-Refers-To-Target-URI')

        # if no target uri, no way to find the original
        if not ref_target_uri:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        ref_target_date = rec_headers.get_header('WARC-Refers-To-Date')

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        digest = cdx.get('digest', '-')

        try:
            orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                    ref_target_date,
                                                    digest,
                                                    cdx_loader)
        except NotFoundException:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        for orig_cdx in orig_cdx_lines:
            try:
                return self._resolve_path_load(orig_cdx, False, failed_files)
            except ArchiveLoadFailed:
                # best effort: this candidate is unusable, try the next
                continue

        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader):
        """
        Query cdx_loader for non-revisit captures of ``url`` closest to
        ``timestamp``, additionally filtered by ``digest`` when one is
        given (not empty and not '-').

        :return: whatever ``cdx_loader(params)`` returns, or an empty
            iterator if no cdx_loader is available
        """
        if not cdx_loader:
            return iter([])

        filters = ['!mime:warc/revisit']

        if digest and digest != '-':
            filters.append('digest:' + digest)

        params = dict(url=url,
                      closest=timestamp,
                      filter=filters)

        return cdx_loader(params)