1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

provenance improvement: don't store source id as provenance,

instead write full url to WARC-Recorded-From-URI, current datetime to WARC-Recorded-On-Date
warcwriter: ensure WARC-Recorded-* headers copied to request record as well
This commit is contained in:
Ilya Kreymer 2017-05-25 13:10:24 -07:00
parent afbe2478cb
commit 630911ef23
2 changed files with 16 additions and 6 deletions

View File

@ -132,10 +132,14 @@ class MultiFileWARCWriter(BaseWARCWriter):
params = params or {}
self._do_write_req_resp(None, record, params)
def _copy_header(self, from_rec, to_rec, name):
    """Copy the WARC header ``name`` from one record to another.

    The value is read from ``from_rec.rec_headers`` and, when it is
    truthy, added to ``to_rec.rec_headers`` under the same name.
    Nothing is added when the header is missing/empty or when there is
    no destination record.

    :param from_rec: record whose ``rec_headers`` is read
    :param to_rec: record whose ``rec_headers`` receives the header;
        may be ``None``
    :param name: name of the header to copy
    """
    header = from_rec.rec_headers.get_header(name)

    # A lone record is written via _do_write_req_resp(None, record, params),
    # so the request record forwarded here can be None; there is nothing to
    # copy onto in that case, and dereferencing it would raise.
    if header and to_rec is not None:
        to_rec.rec_headers.add_header(name, header)
def _do_write_req_resp(self, req, resp, params):
prov = resp.rec_headers.get_header('WARC-Provenance')
if prov:
req.rec_headers.add_header('WARC-Provenance', prov)
self._copy_header(resp, req, 'WARC-Recorded-From-URI')
self._copy_header(resp, req, 'WARC-Recorded-On-Date')
resp = self._check_revisit(resp, params)
if not resp:

View File

@ -22,6 +22,7 @@ import six
import itertools
import json
import glob
import datetime
from requests.models import PreparedRequest
from requests.packages import urllib3
@ -41,7 +42,7 @@ class BaseLoader(object):
warc_headers, other_headers, stream = entry
source = self._get_provenance(cdx)
source = self._get_source_id(cdx)
out_headers = {}
out_headers['WebAgg-Type'] = 'warc'
@ -50,6 +51,9 @@ class BaseLoader(object):
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
out_headers['WebAgg-Source-Coll'] = source
if params.get('recorder_skip'):
out_headers['Recorder-Skip'] = '1'
if not warc_headers:
if other_headers:
out_headers['Link'] = other_headers.get('Link')
@ -91,7 +95,7 @@ class BaseLoader(object):
return out_headers, streamiter
def _get_provenance(self, cdx):
def _get_source_id(self, cdx):
    """Return the quoted ``source`` field of a CDX entry.

    The ``'source'`` value of ``cdx`` (an empty string when the key is
    absent) is passed through ``quote`` with ``':'`` and ``'/'`` marked
    as safe, i.e. left unescaped.

    :param cdx: mapping-like CDX entry supporting ``.get()``
    :return: the quoted source identifier
    """
    raw_source = cdx.get('source', '')
    return quote(raw_source, safe=':/')
def _set_content_len(self, content_len_str, headers, existing_len):
@ -432,7 +436,9 @@ class LiveWebLoader(BaseLoader):
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
if not cdx.get('is_live'):
warc_headers['WARC-Provenance'] = self._get_provenance(cdx)
# Record where and when this response was originally captured: the full
# URL it was loaded from and the current UTC time.
now = datetime.datetime.utcnow()
warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
# Same ISO-date helper that is used for WARC-Date in this method.
warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now)
if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip