1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-20 18:59:11 +01:00
pywb/rezag/responseloader.py

203 lines
6.4 KiB
Python
Raw Normal View History

from rezag.liverec import BaseRecorder
from rezag.liverec import request as remote_request
2016-02-19 17:27:19 -08:00
2016-02-22 13:30:12 -08:00
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
from pywb.utils.wbexception import LiveResourceException
2016-02-22 13:30:12 -08:00
from pywb.warc.resolvingloader import ResolvingLoader
2016-02-19 17:27:19 -08:00
from io import BytesIO
from bottle import response
import uuid
#=============================================================================
class StreamIter(object):
    """Iterate over a file-like stream in fixed-size chunks.

    An optional ``header`` is returned once, before any data read from
    the stream. When a read returns no data the stream is closed, the
    reference to it is dropped, and iteration stops.

    :param stream: object providing ``read(size)`` and ``close()``
    :param header: optional value returned by the first ``next()`` call
    :param size: amount requested from each ``read()`` call
    """

    def __init__(self, stream, header=None, size=8192):
        self.stream = stream
        self.header = header
        self.size = size

    def __iter__(self):
        return self

    def __next__(self):
        """Return the pending header, or the next chunk of the stream.

        :raises StopIteration: once the stream is exhausted or closed
        """
        # The header is handed out exactly once, ahead of any stream data.
        if self.header:
            header = self.header
            self.header = None
            return header

        # close() drops the stream reference; an exhausted or explicitly
        # closed iterator must keep raising StopIteration rather than fail
        # with AttributeError on None.
        if self.stream is None:
            raise StopIteration

        data = self.stream.read(self.size)
        if data:
            return data

        self.close()
        raise StopIteration

    def close(self):
        """Close the underlying stream (best effort) and release it.

        Safe to call more than once; errors from the stream's own
        ``close()`` are deliberately ignored.
        """
        if not self.stream:
            return

        try:
            self.stream.close()
        except Exception:
            # Best-effort cleanup: a failing close must not break the
            # response that is being finished.
            pass
        finally:
            # Always drop the reference, even when close() failed, so the
            # iterator is reliably marked as finished.
            self.stream = None
2016-02-19 17:27:19 -08:00
#=============================================================================
class WARCPathLoader(object):
    """Load an archived record identified by a cdx entry.

    :param paths: a single path prefix/template string or a list of them.
        Each one is combined with the cdx ``filename``; when the cdx
        carries ``_src_params`` the path is first expanded with
        ``str.format(**_src_params)``.
    :param cdx_source: passed through to
        ``ResolvingLoader.load_headers_and_payload``
    """

    def __init__(self, paths, cdx_source):
        self.paths = paths
        if isinstance(paths, str):
            self.paths = [paths]

        self.path_checks = list(self.warc_paths())

        self.resolve_loader = ResolvingLoader(self.path_checks,
                                              no_record_parse=True)
        self.cdx_source = cdx_source

    @staticmethod
    def _make_check(path):
        """Build the resolver callable bound to one specific ``path``."""
        def check(filename, cdx):
            try:
                if hasattr(cdx, '_src_params') and cdx._src_params:
                    full_path = path.format(**cdx._src_params)
                else:
                    full_path = path
                full_path += filename
                return full_path
            except KeyError:
                # The template names a parameter this cdx does not provide.
                return None

        return check

    def warc_paths(self):
        """Yield one ``check(filename, cdx)`` resolver per configured path.

        Each resolver returns the full path for ``filename`` or ``None``
        when the path template cannot be expanded for the given cdx.
        """
        for path in self.paths:
            # Bind ``path`` through a factory: a closure defined directly in
            # this loop would see only the last path once the generator has
            # been drained (as __init__ does with list()).
            yield self._make_check(path)

    def __call__(self, cdx, params):
        """Return a ``StreamIter`` over the record for ``cdx``.

        Returns ``None`` when the cdx has no ``filename`` or no ``offset``.
        As a side effect the record headers and several ``WARC-*`` headers
        are written to the bottle ``response``.

        :param cdx: cdx entry; read via ``get('filename')``,
            ``get('offset')`` and ``get('source')``
        :param params: mapping; ``_src_params`` is copied onto the cdx for
            use by the path resolvers
        """
        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        cdx._src_params = params.get('_src_params')

        failed_files = []
        headers, payload = (self.resolve_loader.
                            load_headers_and_payload(cdx,
                                                     failed_files,
                                                     self.cdx_source))

        record = payload

        for n, v in record.rec_headers.headers:
            response.headers[n] = v

        response.headers['WARC-Coll'] = cdx.get('source')

        # NOTE(review): headers and payload presumably differ only when the
        # payload was resolved from another record (e.g. a revisit) --
        # confirm against ResolvingLoader.
        if headers != payload:
            response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
            response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
            response.headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI')
            response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
            # Only the payload stream is returned; release the separate
            # headers record stream here.
            headers.stream.close()

        res = StreamIter(record.stream)
        return res
2016-02-19 17:27:19 -08:00
#=============================================================================
class HeaderRecorder(BaseRecorder):
    """Recorder that buffers response header lines, minus skipped ones.

    :param skip_list: optional prefix (or tuple of prefixes) handed to
        ``startswith``; a header line whose lower-cased form starts with
        one of them is kept out of the buffer and stored in ``skipped``.
    """

    def __init__(self, skip_list=None):
        self.buff = BytesIO()
        self.skip_list = skip_list
        self.skipped = []

    def write_response_header_line(self, line):
        """Append ``line`` to the buffer unless it is filtered out."""
        if not self.accept_header(line):
            return
        self.buff.write(line)

    def get_header(self):
        """Return everything buffered so far."""
        return self.buff.getvalue()

    def accept_header(self, line):
        """Return True when ``line`` should be kept.

        A rejected line is remembered in ``self.skipped``.
        """
        prefixes = self.skip_list
        rejected = bool(prefixes) and line.lower().startswith(prefixes)
        if not rejected:
            return True

        self.skipped.append(line)
        return False
#=============================================================================
class LiveWebLoader(object):
    """Fetch a resource from an upstream URL and stream it back.

    The upstream response header lines (minus ``SKIP_HEADERS``) are
    captured through a ``HeaderRecorder`` and emitted ahead of the raw
    upstream body.
    """

    # Lower-case header-name prefixes that are dropped from the recorded
    # upstream response headers.
    SKIP_HEADERS = (b'link',
                    b'memento-datetime',
                    b'content-location',
                    b'x-archive')

    def __call__(self, cdx, params):
        """Request ``cdx['load_url']`` and return a ``StreamIter``.

        Returns ``None`` when the cdx has no ``load_url``. As a side effect
        ``Content-Type``, ``WARC-Target-URI``, ``WARC-Date``, ``WARC-Coll``
        and, when known, ``Content-Length`` are set on the bottle
        ``response``.

        :param cdx: mapping read for ``load_url``, ``timestamp``, ``url``,
            ``is_live`` and ``source``
        :param params: mapping providing ``_input_req`` (source of request
            headers, method and body) and optionally ``_timeout``
        :raises LiveResourceException: if the upstream request fails
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        recorder = HeaderRecorder(self.SKIP_HEADERS)

        input_req = params['_input_req']
        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if not cdx.get('is_live'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        # If loading from a different url, rewrite an existing Origin header
        # so that it matches the url actually requested.
        # may need to add other headers
        if load_url != cdx['url']:
            if 'Origin' in req_headers:
                req_headers['Origin'] = self._origin_of(load_url)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        try:
            upstream_res = remote_request(url=load_url,
                                          method=method,
                                          recorder=recorder,
                                          stream=True,
                                          allow_redirects=False,
                                          headers=req_headers,
                                          data=data,
                                          timeout=params.get('_timeout'))
        except Exception:
            raise LiveResourceException(load_url)

        resp_headers = recorder.get_header()

        response.headers['Content-Type'] = 'application/http; msgtype=response'

        #response.headers['WARC-Type'] = 'response'
        #response.headers['WARC-Record-ID'] = self._make_warc_id()
        response.headers['WARC-Target-URI'] = cdx['url']
        response.headers['WARC-Date'] = self._make_date(dt)
        response.headers['WARC-Coll'] = cdx.get('source', '')

        # Try to set content-length, if it is available and valid
        total_len = self._total_length(upstream_res.headers,
                                       len(resp_headers))
        if total_len is not None:
            response.headers['Content-Length'] = total_len

        return StreamIter(upstream_res.raw, header=resp_headers)

    @staticmethod
    def _origin_of(url):
        """Return the ``scheme://netloc`` origin of ``url``."""
        # Imported here because the module does not import urlsplit.
        from urllib.parse import urlsplit

        splits = urlsplit(url)
        return splits.scheme + '://' + splits.netloc

    @staticmethod
    def _total_length(upstream_headers, header_len):
        """Return upstream content-length plus ``header_len``, or ``None``.

        ``None`` is returned when the upstream content-length is missing,
        not a valid integer, or not positive, so that no Content-Length is
        sent instead of failing the whole response.
        """
        try:
            content_len = int(upstream_headers.get('content-length', 0))
        except (TypeError, ValueError):
            return None

        if content_len > 0:
            return content_len + header_len
        return None

    @staticmethod
    def _make_date(dt):
        """Format ``dt`` as ``YYYY-MM-DDTHH:MM:SSZ``."""
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    @staticmethod
    def _make_warc_id(id_=None):  #pragma: no cover
        """Return ``<urn:uuid:...>`` for ``id_`` (a new uuid1 if omitted)."""
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)