1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

provenance improvement: don't store the source id as provenance;

instead, write the full URL to WARC-Recorded-From-URI and the current datetime to WARC-Recorded-On-Date
warcwriter: ensure the WARC-Recorded-* headers are copied to the request record as well
This commit is contained in:
Ilya Kreymer 2017-05-25 13:10:24 -07:00
parent 8e97a29c0b
commit 5930b2acb3
2 changed files with 16 additions and 6 deletions

View File

@ -132,10 +132,14 @@ class MultiFileWARCWriter(BaseWARCWriter):
params = params or {} params = params or {}
self._do_write_req_resp(None, record, params) self._do_write_req_resp(None, record, params)
def _copy_header(self, from_rec, to_rec, name):
header = from_rec.rec_headers.get_header(name)
if header:
to_rec.rec_headers.add_header(name, header)
def _do_write_req_resp(self, req, resp, params): def _do_write_req_resp(self, req, resp, params):
prov = resp.rec_headers.get_header('WARC-Provenance') self._copy_header(resp, req, 'WARC-Recorded-From-URI')
if prov: self._copy_header(resp, req, 'WARC-Recorded-On-Date')
req.rec_headers.add_header(prov)
resp = self._check_revisit(resp, params) resp = self._check_revisit(resp, params)
if not resp: if not resp:

View File

@ -22,6 +22,7 @@ import six
import itertools import itertools
import json import json
import glob import glob
import datetime
from requests.models import PreparedRequest from requests.models import PreparedRequest
from requests.packages import urllib3 from requests.packages import urllib3
@ -41,7 +42,7 @@ class BaseLoader(object):
warc_headers, other_headers, stream = entry warc_headers, other_headers, stream = entry
source = self._get_provenance(cdx) source = self._get_source_id(cdx)
out_headers = {} out_headers = {}
out_headers['WebAgg-Type'] = 'warc' out_headers['WebAgg-Type'] = 'warc'
@ -50,6 +51,9 @@ class BaseLoader(object):
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
out_headers['WebAgg-Source-Coll'] = source out_headers['WebAgg-Source-Coll'] = source
if params.get('recorder_skip'):
out_headers['Recorder-Skip'] = '1'
if not warc_headers: if not warc_headers:
if other_headers: if other_headers:
out_headers['Link'] = other_headers.get('Link') out_headers['Link'] = other_headers.get('Link')
@ -91,7 +95,7 @@ class BaseLoader(object):
return out_headers, streamiter return out_headers, streamiter
def _get_provenance(self, cdx): def _get_source_id(self, cdx):
return quote(cdx.get('source', ''), safe=':/') return quote(cdx.get('source', ''), safe=':/')
def _set_content_len(self, content_len_str, headers, existing_len): def _set_content_len(self, content_len_str, headers, existing_len):
@ -366,7 +370,9 @@ class LiveWebLoader(BaseLoader):
warc_headers['WARC-Date'] = datetime_to_iso_date(dt) warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
if not cdx.get('is_live'): if not cdx.get('is_live'):
warc_headers['WARC-Provenance'] = self._get_provenance(cdx) now = datetime.datetime.utcnow()
warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now)
if remote_ip: if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip warc_headers['WARC-IP-Address'] = remote_ip