diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py
index 806cc009..9c206213 100644
--- a/pywb/recorder/multifilewarcwriter.py
+++ b/pywb/recorder/multifilewarcwriter.py
@@ -132,10 +132,14 @@ class MultiFileWARCWriter(BaseWARCWriter):
         params = params or {}
         self._do_write_req_resp(None, record, params)
 
+    def _copy_header(self, from_rec, to_rec, name):
+        header = from_rec.rec_headers.get_header(name)
+        if header:
+            to_rec.rec_headers.add_header(name, header)
+
     def _do_write_req_resp(self, req, resp, params):
-        prov = resp.rec_headers.get_header('WARC-Provenance')
-        if prov:
-            req.rec_headers.add_header('WARC-Provenance', prov)
+        self._copy_header(resp, req, 'WARC-Recorded-From-URI')
+        self._copy_header(resp, req, 'WARC-Recorded-On-Date')
 
         resp = self._check_revisit(resp, params)
         if not resp:
diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py
index c07de57f..566c1f88 100644
--- a/pywb/webagg/responseloader.py
+++ b/pywb/webagg/responseloader.py
@@ -22,6 +22,7 @@ import six
 import itertools
 import json
 import glob
+import datetime
 
 from requests.models import PreparedRequest
 from requests.packages import urllib3
@@ -41,7 +42,7 @@ class BaseLoader(object):
 
         warc_headers, other_headers, stream = entry
 
-        source = self._get_provenance(cdx)
+        source = self._get_source_id(cdx)
 
         out_headers = {}
         out_headers['WebAgg-Type'] = 'warc'
@@ -50,6 +51,9 @@ class BaseLoader(object):
         out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
         out_headers['WebAgg-Source-Coll'] = source
 
+        if params.get('recorder_skip'):
+            out_headers['Recorder-Skip'] = '1'
+
         if not warc_headers:
             if other_headers:
                 out_headers['Link'] = other_headers.get('Link')
@@ -91,7 +95,7 @@ class BaseLoader(object):
 
         return out_headers, streamiter
 
-    def _get_provenance(self, cdx):
+    def _get_source_id(self, cdx):
         return quote(cdx.get('source', ''), safe=':/')
 
     def _set_content_len(self, content_len_str, headers, existing_len):
@@ -432,7 +436,9 @@ class LiveWebLoader(BaseLoader):
         warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
 
         if not cdx.get('is_live'):
-            warc_headers['WARC-Provenance'] = self._get_provenance(cdx)
+            now = datetime.datetime.utcnow()
+            warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
+            warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now)
 
         if remote_ip:
             warc_headers['WARC-IP-Address'] = remote_ip