2016-03-21 11:04:52 -07:00
|
|
|
from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
|
2016-03-24 16:08:29 -04:00
|
|
|
from webagg.utils import ParamFormatter
|
2016-03-21 11:04:52 -07:00
|
|
|
from webagg.indexsource import RedisIndexSource
|
2016-03-03 11:04:28 -08:00
|
|
|
|
2016-03-08 10:27:13 -08:00
|
|
|
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
|
|
|
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
|
|
|
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
|
2016-03-06 23:10:30 -08:00
|
|
|
|
2016-03-24 16:08:29 -04:00
|
|
|
from pywb.utils.wbexception import LiveResourceException, WbException
|
|
|
|
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
2016-03-06 23:10:30 -08:00
|
|
|
|
2016-02-22 13:30:12 -08:00
|
|
|
from pywb.warc.resolvingloader import ResolvingLoader
|
2016-02-19 17:27:19 -08:00
|
|
|
|
2016-07-23 21:57:24 -04:00
|
|
|
from six.moves.urllib.parse import urlsplit, quote, unquote
|
2016-03-24 16:08:29 -04:00
|
|
|
|
2016-05-28 15:01:33 -07:00
|
|
|
from io import BytesIO
|
2016-02-19 17:27:19 -08:00
|
|
|
|
|
|
|
import uuid
|
2016-03-03 13:58:09 -08:00
|
|
|
import six
|
2016-03-06 23:10:30 -08:00
|
|
|
import itertools
|
2016-05-28 15:01:33 -07:00
|
|
|
import json
|
2016-05-24 18:01:44 -07:00
|
|
|
|
|
|
|
from requests.models import PreparedRequest
|
2016-04-27 10:16:54 -07:00
|
|
|
import urllib3
|
2016-02-19 17:27:19 -08:00
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
2016-03-03 11:04:28 -08:00
|
|
|
class BaseLoader(object):
    """
    Base class for resource loaders.

    Subclasses implement ``load_resource(cdx, params)`` returning a
    ``(warc_headers, other_headers, stream)`` tuple (or None).  ``__call__``
    wraps that tuple into outgoing response headers plus a stream iterator
    for the webagg response.
    """
    def __call__(self, cdx, params):
        """Load the resource for *cdx* and return ``(out_headers, streamiter)``,
        or ``(None, None)`` if the resource could not be loaded.
        """
        entry = self.load_resource(cdx, params)
        if not entry:
            return None, None

        warc_headers, other_headers, stream = entry

        out_headers = {}
        out_headers['WebAgg-Type'] = 'warc'
        # source may contain ':' and '/' (eg. coll:path); keep them readable
        out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
        out_headers['Content-Type'] = 'application/warc-record'

        if not warc_headers:
            # passthrough case (eg. upstream webagg response): copy through
            # the select memento headers from the upstream response, if any
            if other_headers:
                out_headers['Link'] = other_headers.get('Link')
                out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
                out_headers['Content-Length'] = other_headers.get('Content-Length')

            return out_headers, StreamIter(stream)

        out_headers['Link'] = MementoUtils.make_link(
                                warc_headers.get_header('WARC-Target-URI'),
                                'original')

        memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
        out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

        warc_headers_buff = warc_headers.to_bytes()

        # total length = declared record length + serialized warc headers
        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))

        streamiter = StreamIter(stream,
                                header1=warc_headers_buff,
                                header2=other_headers)

        if not lenset:
            # unknown total length: fall back to chunked transfer-encoding
            out_headers['Transfer-Encoding'] = 'chunked'
            streamiter = chunk_encode_iter(streamiter)

        return out_headers, streamiter

    def _set_content_len(self, content_len_str, headers, existing_len):
        """Set ``Content-Length`` on *headers* to ``content_len_str + existing_len``
        when *content_len_str* parses to a non-negative int.

        :return: True if the header was set, False otherwise
        """
        # Try to set content-length, if it is available and valid
        try:
            content_len = int(content_len_str)
        # ValueError added: a malformed (non-numeric) Content-Length header
        # previously escaped this handler and crashed the request
        except (KeyError, TypeError, ValueError):
            content_len = -1

        if content_len >= 0:
            content_len += existing_len
            headers['Content-Length'] = str(content_len)
            return True

        return False

    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop

        :raises WbException: when the redirect points back at the request url
        """
        # 304 is "not modified", not a location redirect -- never a loop
        if not status_code.startswith('3') or status_code == '304':
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        location_url = location_url.lower()
        # host-relative redirect: resolve against the capture url's host
        if location_url.startswith('/'):
            host = urlsplit(cdx['url']).netloc
            location_url = host + location_url

        if request_url == location_url:
            msg = 'Self Redirect {0} -> {1}'
            msg = msg.format(request_url, location_url)
            raise WbException(msg)

    @staticmethod
    def _make_warc_id(id_=None):
        """Return a ``WARC-Record-ID`` urn; generates a new uuid1 if *id_* not given."""
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)
|
|
|
|
|
2016-03-12 22:15:24 -08:00
|
|
|
|
2016-03-21 11:04:52 -07:00
|
|
|
#=============================================================================
|
|
|
|
class PrefixResolver(object):
    """Resolve a WARC filename by prepending a (possibly templated) path prefix."""

    def __init__(self, template):
        self.template = template

    def __call__(self, filename, cdx):
        """Return the template (formatted via the cdx's attached formatter,
        when one is present) joined with *filename*.
        """
        formatter = getattr(cdx, '_formatter', None)
        prefix = formatter.format(self.template) if formatter else self.template
        return prefix + filename
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class RedisResolver(RedisIndexSource):
    """Resolve a WARC filename to its full path via a redis hash lookup."""

    def __call__(self, filename, cdx):
        """Look up *filename* in the redis hash named by the key template
        (formatted via the cdx's attached formatter when present).
        A '*' in the key triggers a scan over all matching keys.
        """
        key = self.redis_key_template
        formatter = getattr(cdx, '_formatter', None)
        if formatter:
            key = formatter.format(key)

        result = None

        if '*' not in key:
            result = self.redis.hget(key, filename)
        else:
            # wildcard template: probe each matching hash until a hit
            for candidate in self.redis.scan_iter(key):
                result = self.redis.hget(candidate, filename)
                if result:
                    break

        # redis returns bytes on py3; normalize to text
        if result and six.PY3:
            result = result.decode('utf-8')

        return result
|
2016-03-03 11:04:28 -08:00
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class WARCPathLoader(BaseLoader):
    """
    Loads a record from a local/remote WARC file, using the cdx
    'filename' and 'offset' fields, resolving the filename to a full
    path through a list of path resolvers (prefix or redis based).
    """
    def __init__(self, paths, cdx_source):
        # paths: a single path template or a list of them; each is turned
        # into a callable resolver below
        self.paths = paths
        if isinstance(paths, six.string_types):
            self.paths = [paths]

        self.resolvers = [self._make_resolver(path) for path in self.paths]

        # no_record_parse: defer http header parsing; done here only for 3xx
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)

        # lenient parser used to parse status/headers of redirect responses
        self.headers_parser = StatusAndHeadersParser([], verify=False)

        # fallback cdx source used to re-query when revisit resolution
        # needs the original record
        self.cdx_source = cdx_source

    def cdx_index_source(self, *args, **kwargs):
        """Adapter: unwrap the (cdx_iter, errs) pair from the cdx source,
        returning only the iterator as expected by ResolvingLoader."""
        cdx_iter, errs = self.cdx_source(*args, **kwargs)
        return cdx_iter

    def _make_resolver(self, path):
        """Return a filename->path resolver for *path*: passed through if
        already callable, RedisResolver for redis:// urls, else a prefix."""
        if hasattr(path, '__call__'):
            return path

        if path.startswith('redis://'):
            return RedisResolver(path)

        else:
            return PrefixResolver(path)

    def load_resource(self, cdx, params):
        """Load the WARC record for *cdx*.

        Returns a (warc_headers, http_headers_buff, stream) tuple,
        or None if the cdx has no filename/offset to load from.
        """
        # a previous step may have attached an already-loaded result
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        # 'source' may be 'coll:subpath'; the formatter gets only the coll part
        orig_source = cdx.get('source', '').split(':')[0]
        cdx._formatter = ParamFormatter(params, orig_source)

        failed_files = []
        headers, payload = (self.resolve_loader.
                             load_headers_and_payload(cdx,
                                                      failed_files,
                                                      self.cdx_index_source))

        # for redirects, parse the http headers now so that self-redirect
        # loops can be rejected before returning the record
        if cdx.get('status', '').startswith('3'):
            status_headers = self.headers_parser.parse(payload.stream)
            self.raise_on_self_redirect(params, cdx,
                                        status_headers.get_statuscode(),
                                        status_headers.get_header('Location'))
            http_headers_buff = status_headers.to_bytes()
        else:
            http_headers_buff = None

        warc_headers = payload.rec_headers

        # headers != payload: revisit record resolved to a separate original
        # record; merge the two sets of warc headers so the returned record
        # carries the revisit's uri/date with refers-to pointing at the original
        if headers != payload:
            warc_headers.replace_header('WARC-Refers-To-Target-URI',
                         payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Refers-To-Date',
                         payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header('WARC-Target-URI',
                         headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Date',
                         headers.rec_headers.get_header('WARC-Date'))

            # only the payload stream is returned; close the other record
            headers.stream.close()

        return (warc_headers, http_headers_buff, payload.stream)

    def __str__(self):
        return 'WARCPathLoader'
|
2016-03-06 09:10:17 -08:00
|
|
|
|
|
|
|
|
2016-02-19 17:27:19 -08:00
|
|
|
#=============================================================================
|
2016-03-03 11:04:28 -08:00
|
|
|
class LiveWebLoader(BaseLoader):
    """
    Loads a resource from the live web (or an upstream memento/webagg
    endpoint) via urllib3, synthesizing WARC response headers so the
    result looks like a WARC record to BaseLoader.__call__.
    """
    # response headers never copied into the reconstructed http block
    # (lowercase; compared case-insensitively on the py3 path)
    SKIP_HEADERS = ('link',
                    'memento-datetime',
                    'content-location',
                    'x-archive')

    def __init__(self):
        self.num_retries = 3
        self.num_pools = 10
        self.num_conn_per_pool = 10

        self.pool = urllib3.PoolManager(num_pools=self.num_pools,
                                        maxsize=self.num_conn_per_pool)

    def load_resource(self, cdx, params):
        """Fetch the resource at cdx['load_url'].

        Returns (warc_headers, http_headers_buff, stream) or None when
        this loader does not apply / the memento response is invalid.

        :raises LiveResourceException: if the upstream fetch fails
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        # video metadata requests are handled by VideoLoader instead
        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        # original inbound request, used to replay method/headers/body
        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        # when proxying to a memento endpoint, request the specific datetime
        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # normalize/encode the url the same way requests would
        p = PreparedRequest()
        p.prepare_url(load_url, None)
        load_url = p.url

        try:
            upstream_res = self.pool.urlopen(method=method,
                                             url=load_url,
                                             body=data,
                                             headers=req_headers,
                                             redirect=False,
                                             assert_same_host=False,
                                             preload_content=False,
                                             decode_content=False,
                                             retries=self.num_retries,
                                             timeout=params.get('_timeout'))

        except Exception as e:
            raise LiveResourceException(load_url)

        # prefer the upstream Memento-Datetime as the record timestamp
        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
            return None

        # upstream is itself a webagg endpoint already returning a warc
        # record: pass its body through untouched (no warc headers here)
        agg_type = upstream_res.headers.get('WebAgg-Type')
        if agg_type == 'warc':
            cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        self.raise_on_self_redirect(params, cdx,
                                    str(upstream_res.status),
                                    upstream_res.headers.get('Location'))

        # urllib3 encodes the http version as an int (11 -> HTTP/1.1)
        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        # rebuild the raw http status line + headers as they'd appear
        # inside a WARC response record
        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                if n.lower() in self.SKIP_HEADERS:
                    continue

                http_headers_buff += n + ': ' + v + '\r\n'
        except:  #pragma: no cover
            #PY 2
            # raw header lines already include name and '\r\n'
            resp_headers = orig_resp.msg.headers
            for n, v in zip(orig_resp.getheaders(), resp_headers):
                if n in self.SKIP_HEADERS:
                    continue

                http_headers_buff += v

        http_headers_buff += '\r\n'
        # latin-1: headers are byte-transparent per the http spec
        http_headers_buff = http_headers_buff.encode('latin-1')

        # best-effort: dig out the peer ip from the underlying socket
        # (relies on urllib3/httplib internals; may fail harmlessly)
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        # record length = body length + reconstructed http header block
        self._set_content_len(upstream_res.headers.get('Content-Length', -1),
                              warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)

    def __str__(self):
        return 'LiveWebLoader'
|
|
|
|
|
2016-05-28 15:01:33 -07:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class VideoLoader(BaseLoader):
    """
    Loads video format metadata for a url via youtube-dl (an optional
    dependency) and returns it as a synthesized WARC metadata record.
    """

    CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self):
        # youtube-dl is optional: without it this loader simply disables itself
        try:
            from youtube_dl import YoutubeDL as YoutubeDL
        except ImportError:
            self.ydl = None
            return

        self.ydl = YoutubeDL(dict(simulate=True,
                                  youtube_include_dash_manifest=False))
        self.ydl.add_default_info_extractors()

    def load_resource(self, cdx, params):
        """Extract video info for cdx['load_url'] and return it as a
        (warc_headers, None, stream) metadata record, or None when this
        loader does not apply."""
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') != self.CONTENT_TYPE:
            return None

        if not self.ydl:
            return None

        info = self.ydl.extract_info(load_url)
        info_buff = json.dumps(info).encode('utf-8')

        # record the metadata under a metadata:// variant of the source url
        _, rest = load_url.split('://', 1)
        target_url = 'metadata://' + rest

        dt = timestamp_to_datetime(cdx['timestamp'])

        record_headers = {}
        record_headers['WARC-Type'] = 'metadata'
        record_headers['WARC-Record-ID'] = self._make_warc_id()
        record_headers['WARC-Target-URI'] = target_url
        record_headers['WARC-Date'] = datetime_to_iso_date(dt)
        record_headers['Content-Type'] = self.CONTENT_TYPE
        record_headers['Content-Length'] = str(len(info_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', record_headers.items())

        return warc_headers, None, BytesIO(info_buff)
|
|
|
|
|