2016-03-03 11:55:43 -08:00
|
|
|
from webagg.liverec import BaseRecorder
|
|
|
|
from webagg.liverec import request as remote_request
|
2016-02-19 17:27:19 -08:00
|
|
|
|
2016-03-03 11:55:43 -08:00
|
|
|
from webagg.utils import MementoUtils
|
2016-03-03 11:04:28 -08:00
|
|
|
|
2016-02-22 13:30:12 -08:00
|
|
|
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
2016-03-03 11:04:28 -08:00
|
|
|
from pywb.utils.timeutils import iso_date_to_datetime
|
2016-02-29 12:34:06 -08:00
|
|
|
from pywb.utils.wbexception import LiveResourceException
|
2016-02-22 13:30:12 -08:00
|
|
|
from pywb.warc.resolvingloader import ResolvingLoader
|
2016-02-19 17:27:19 -08:00
|
|
|
|
|
|
|
from io import BytesIO
|
|
|
|
from bottle import response
|
|
|
|
|
|
|
|
import uuid
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
2016-03-01 14:46:05 -08:00
|
|
|
class StreamIter(object):
    """Iterator that yields a file-like stream in fixed-size chunks.

    If a truthy ``header`` is supplied it is yielded once, before any data
    read from ``stream``. When the stream returns no more data it is closed
    and iteration stops.

    :param stream: object exposing ``read(size)`` and ``close()``
    :param header: optional first chunk to emit ahead of the stream data
    :param size: number of bytes requested per ``read`` call
    """

    def __init__(self, stream, header=None, size=8192):
        self.stream = stream
        self.header = header
        self.size = size

    def __iter__(self):
        return self

    def __next__(self):
        # Emit the pending header exactly once.
        if self.header:
            header = self.header
            self.header = None
            return header

        # After exhaustion (or an explicit close()) the stream reference is
        # gone. The iterator protocol requires that we keep raising
        # StopIteration rather than failing on ``None.read``.
        if self.stream is None:
            raise StopIteration

        data = self.stream.read(self.size)
        if data:
            return data

        self.close()
        raise StopIteration

    def close(self):
        """Close the underlying stream, if any. Safe to call repeatedly."""
        stream = self.stream
        if stream is None:
            return

        # Drop the reference before closing so that a failing close() cannot
        # leave a half-closed stream attached to this iterator.
        self.stream = None
        try:
            stream.close()
        except Exception:
            # Closing is best-effort: errors here must not break the response.
            pass
|
2016-02-24 14:22:29 -08:00
|
|
|
|
2016-02-19 17:27:19 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
2016-03-03 11:04:28 -08:00
|
|
|
class BaseLoader(object):
    """Base class for resource loaders.

    Calling an instance delegates to :meth:`_load_resource` and, when that
    returns a truthy result, adds the ``WARC-Coll``, ``Link`` and
    ``Memento-Datetime`` headers to the bottle ``response``.

    ``__call__`` reads ``WARC-Target-URI`` and ``WARC-Date`` back from the
    response headers, so ``_load_resource`` implementations must have set
    them before returning a truthy result.
    """

    def __call__(self, cdx, params):
        """Load the resource described by ``cdx``.

        :param cdx: index entry; ``cdx.get('source', '')`` is used for the
            ``WARC-Coll`` header
        :param params: request parameters, passed through to
            ``_load_resource``
        :return: whatever ``_load_resource`` returned (falsy results are
            returned untouched, with no headers added)
        """
        res = self._load_resource(cdx, params)
        if not res:
            return res

        response.headers['WARC-Coll'] = cdx.get('source', '')

        response.headers['Link'] = MementoUtils.make_link(
            response.headers['WARC-Target-URI'],
            'original')

        memento_dt = iso_date_to_datetime(response.headers['WARC-Date'])
        response.headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
        return res

    def _load_resource(self, cdx, params):  #pragma: no cover
        """Load the resource; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class
        """
        # ``NotImplemented`` is a non-callable constant used for comparison
        # fallbacks; the exception for abstract methods is NotImplementedError.
        raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class WARCPathLoader(BaseLoader):
    """Loader that resolves a cdx entry to a WARC record via path templates.

    :param paths: a single path prefix/template string, or a list of them.
        Each one is combined with the cdx filename to build a candidate
        location (see :meth:`warc_paths`).
    :param cdx_source: callable returning a ``(cdx_iter, errs)`` pair; it is
        handed to the resolving loader through :meth:`cdx_index_source`.
    """

    def __init__(self, paths, cdx_source):
        self.paths = paths
        if isinstance(paths, str):
            self.paths = [paths]

        self.path_checks = list(self.warc_paths())

        self.resolve_loader = ResolvingLoader(self.path_checks,
                                              no_record_parse=True)
        self.cdx_source = cdx_source

    def cdx_index_source(self, *args, **kwargs):
        """Query ``cdx_source`` and return only the cdx iterator.

        The error part of the ``(cdx_iter, errs)`` result is discarded.
        """
        cdx_iter, errs = self.cdx_source(*args, **kwargs)
        return cdx_iter

    def warc_paths(self):
        """Yield one path-resolving callable per configured path."""
        for path in self.paths:
            # Build each checker through a factory so it binds its *own*
            # path. Defining the closure directly in this loop would make
            # every checker share the loop variable, and since __init__
            # drains this generator with list(), all of them would end up
            # using the last path.
            yield self._make_path_check(path)

    @staticmethod
    def _make_path_check(path):
        """Return a ``check(filename, cdx)`` callable bound to ``path``."""
        def check(filename, cdx):
            try:
                if hasattr(cdx, '_src_params') and cdx._src_params:
                    # Fill template placeholders from the per-request params.
                    full_path = path.format(**cdx._src_params)
                else:
                    full_path = path
                full_path += filename
                return full_path
            except KeyError:
                # Template needs a key that was not supplied: no match here.
                return None

        return check

    def _load_resource(self, cdx, params):
        """Load the record for ``cdx`` and copy its WARC headers to the response.

        :return: a :class:`StreamIter` over the payload record stream, or
            ``None`` when ``cdx`` has no ``filename`` or no ``offset``
        """
        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        # Made available to the path checkers built in warc_paths().
        cdx._src_params = params.get('_src_params')
        failed_files = []
        headers, payload = (self.resolve_loader.
                             load_headers_and_payload(cdx,
                                                      failed_files,
                                                      self.cdx_index_source))

        record = payload

        for n, v in record.rec_headers.headers:
            response.headers[n] = v

        # When the headers come from a different record than the payload
        # (presumably a revisit resolved to its original -- confirm against
        # ResolvingLoader), report the header record's URI/date and expose
        # the payload record's values as WARC-Refers-To-*.
        if headers != payload:
            response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
            response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
            response.headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI')
            response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
            # Only the payload stream is returned; release the other one.
            headers.stream.close()

        return StreamIter(record.stream)

    def __str__(self):
        return 'WARCPathLoader'
|
|
|
|
|
2016-02-19 17:27:19 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class HeaderRecorder(BaseRecorder):
    """Recorder that buffers response header lines, minus a skip list.

    :param skip_list: optional prefix (or tuple of prefixes) matched against
        the lower-cased header line; matching lines are kept in ``skipped``
        instead of being written to the buffer. Must be a value accepted by
        ``startswith`` on the line type.
    """

    def __init__(self, skip_list=None):
        self.buff = BytesIO()
        self.skip_list = skip_list
        self.skipped = []

    def write_response_header_line(self, line):
        """Append ``line`` to the buffer unless it is filtered out."""
        if not self.accept_header(line):
            return

        self.buff.write(line)

    def get_header(self):
        """Return everything buffered so far."""
        return self.buff.getvalue()

    def accept_header(self, line):
        """Return ``True`` if ``line`` should be kept.

        Rejected lines are appended to ``self.skipped``.
        """
        # Nothing configured to skip: every line is accepted.
        if not self.skip_list:
            return True

        if not line.lower().startswith(self.skip_list):
            return True

        self.skipped.append(line)
        return False
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
2016-03-03 11:04:28 -08:00
|
|
|
class LiveWebLoader(BaseLoader):
    """Loader that fetches the resource from ``cdx['load_url']`` over HTTP.

    The upstream response header lines are captured with a
    :class:`HeaderRecorder` (dropping those listed in ``SKIP_HEADERS``) and
    emitted ahead of the raw upstream body.
    """

    # Lower-cased header-line prefixes that are not recorded.
    SKIP_HEADERS = (b'link',
                    b'memento-datetime',
                    b'content-location',
                    b'x-archive')

    def _load_resource(self, cdx, params):
        """Fetch ``cdx['load_url']`` and stream back headers + body.

        :param cdx: index entry; reads ``load_url``, ``timestamp``, ``url``
            and ``is_live``
        :param params: reads ``_input_req`` (required) and ``_timeout``
        :return: a :class:`StreamIter`, or ``None`` if ``cdx`` has no
            ``load_url``
        :raises LiveResourceException: if the upstream request fails
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        recorder = HeaderRecorder(self.SKIP_HEADERS)

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if not cdx.get('is_live'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        # If loading from a different url, rewrite an existing Origin header
        # so that it matches the url actually being requested.
        # May need to adjust other headers as well.
        if load_url != cdx['url']:
            if 'Origin' in req_headers:
                # Imported here because the module does not import urlsplit
                # at file level.
                try:
                    from urllib.parse import urlsplit
                except ImportError:  #pragma: no cover
                    from urlparse import urlsplit

                splits = urlsplit(load_url)
                req_headers['Origin'] = splits.scheme + '://' + splits.netloc

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        try:
            upstream_res = remote_request(url=load_url,
                                          method=method,
                                          recorder=recorder,
                                          stream=True,
                                          allow_redirects=False,
                                          headers=req_headers,
                                          data=data,
                                          timeout=params.get('_timeout'))
        except Exception:
            raise LiveResourceException(load_url)

        resp_headers = recorder.get_header()

        response.headers['Content-Type'] = 'application/http; msgtype=response'

        #response.headers['WARC-Type'] = 'response'
        #response.headers['WARC-Record-ID'] = self._make_warc_id()
        response.headers['WARC-Target-URI'] = cdx['url']
        response.headers['WARC-Date'] = self._make_date(dt)

        # Try to set content-length, if it is available and valid.
        # The emitted length covers the recorded header block plus the body.
        try:
            content_len = int(upstream_res.headers.get('content-length', 0))
            if content_len > 0:
                content_len += len(resp_headers)
                response.headers['Content-Length'] = content_len
        except (KeyError, TypeError, ValueError):
            # ValueError: upstream sent a non-numeric content-length; treat
            # it as unavailable instead of failing the whole response.
            pass

        return StreamIter(upstream_res.raw, header=resp_headers)

    @staticmethod
    def _make_date(dt):
        """Format ``dt`` as ``YYYY-MM-DDTHH:MM:SSZ``."""
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    @staticmethod
    def _make_warc_id(id_=None):  #pragma: no cover
        """Return ``id_`` (or a fresh ``uuid1``) wrapped as ``<urn:uuid:...>``."""
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)

    def __str__(self):
        return 'LiveWebLoader'
|
|
|
|
|