mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
provenance improvement: don't store source id as provenance,
instead write the full URL to WARC-Recorded-From-URI and the current datetime to WARC-Recorded-On-Date.
warcwriter: ensure the WARC-Recorded-* headers are copied to the request record as well.
This commit is contained in:
parent
afbe2478cb
commit
630911ef23
@ -132,10 +132,14 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
params = params or {}
|
||||
self._do_write_req_resp(None, record, params)
|
||||
|
||||
def _copy_header(self, from_rec, to_rec, name):
|
||||
header = from_rec.rec_headers.get_header(name)
|
||||
if header:
|
||||
to_rec.rec_headers.add_header(name, header)
|
||||
|
||||
def _do_write_req_resp(self, req, resp, params):
|
||||
prov = resp.rec_headers.get_header('WARC-Provenance')
|
||||
if prov:
|
||||
req.rec_headers.add_header('WARC-Provenance', prov)
|
||||
self._copy_header(resp, req, 'WARC-Recorded-From-URI')
|
||||
self._copy_header(resp, req, 'WARC-Recorded-On-Date')
|
||||
|
||||
resp = self._check_revisit(resp, params)
|
||||
if not resp:
|
||||
|
@ -22,6 +22,7 @@ import six
|
||||
import itertools
|
||||
import json
|
||||
import glob
|
||||
import datetime
|
||||
|
||||
from requests.models import PreparedRequest
|
||||
from requests.packages import urllib3
|
||||
@ -41,7 +42,7 @@ class BaseLoader(object):
|
||||
|
||||
warc_headers, other_headers, stream = entry
|
||||
|
||||
source = self._get_provenance(cdx)
|
||||
source = self._get_source_id(cdx)
|
||||
|
||||
out_headers = {}
|
||||
out_headers['WebAgg-Type'] = 'warc'
|
||||
@ -50,6 +51,9 @@ class BaseLoader(object):
|
||||
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
||||
out_headers['WebAgg-Source-Coll'] = source
|
||||
|
||||
if params.get('recorder_skip'):
|
||||
out_headers['Recorder-Skip'] = '1'
|
||||
|
||||
if not warc_headers:
|
||||
if other_headers:
|
||||
out_headers['Link'] = other_headers.get('Link')
|
||||
@ -91,7 +95,7 @@ class BaseLoader(object):
|
||||
|
||||
return out_headers, streamiter
|
||||
|
||||
def _get_provenance(self, cdx):
|
||||
def _get_source_id(self, cdx):
|
||||
return quote(cdx.get('source', ''), safe=':/')
|
||||
|
||||
def _set_content_len(self, content_len_str, headers, existing_len):
|
||||
@ -432,7 +436,9 @@ class LiveWebLoader(BaseLoader):
|
||||
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
|
||||
|
||||
if not cdx.get('is_live'):
|
||||
warc_headers['WARC-Provenance'] = self._get_provenance(cdx)
|
||||
now = datetime.datetime.utcnow()
|
||||
warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
|
||||
warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now)
|
||||
|
||||
if remote_ip:
|
||||
warc_headers['WARC-IP-Address'] = remote_ip
|
||||
|
Loading…
x
Reference in New Issue
Block a user