mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
recorder warcwriter: allow skipping the write of only the request record or only the response record by overriding _is_write_req and _is_write_resp in a subclass
(TODO: rethink the interface)
This commit is contained in:
parent
a93f75dca2
commit
0b255819ff
@ -37,7 +37,8 @@ class RecorderApp(object):
|
|||||||
|
|
||||||
self.skip_filters = skip_filters
|
self.skip_filters = skip_filters
|
||||||
|
|
||||||
def create_default_filters(self, kwargs):
|
@staticmethod
|
||||||
|
def create_default_filters(kwargs):
|
||||||
skip_filters = [SkipRangeRequestFilter()]
|
skip_filters = [SkipRangeRequestFilter()]
|
||||||
|
|
||||||
accept_colls = kwargs.get('accept_colls')
|
accept_colls = kwargs.get('accept_colls')
|
||||||
|
@ -44,7 +44,7 @@ class BaseWARCWriter(object):
|
|||||||
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
|
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
|
||||||
|
|
||||||
def __init__(self, gzip=True, dedup_index=None, name='recorder',
|
def __init__(self, gzip=True, dedup_index=None, name='recorder',
|
||||||
header_filter=ExcludeNone()):
|
header_filter=ExcludeNone(), *args, **kwargs):
|
||||||
self.gzip = gzip
|
self.gzip = gzip
|
||||||
self.dedup_index = dedup_index
|
self.dedup_index = dedup_index
|
||||||
self.rec_source_name = name
|
self.rec_source_name = name
|
||||||
@ -85,13 +85,13 @@ class BaseWARCWriter(object):
|
|||||||
record.status_headers.headers_buff = buff
|
record.status_headers.headers_buff = buff
|
||||||
|
|
||||||
def write_req_resp(self, req, resp, params):
|
def write_req_resp(self, req, resp, params):
|
||||||
url = resp.rec_headers.get('WARC-Target-Uri')
|
url = resp.rec_headers.get('WARC-Target-URI')
|
||||||
dt = resp.rec_headers.get('WARC-Date')
|
dt = resp.rec_headers.get('WARC-Date')
|
||||||
|
|
||||||
if not req.rec_headers.get('WARC-Record-ID'):
|
if not req.rec_headers.get('WARC-Record-ID'):
|
||||||
req.rec_headers['WARC-Record-ID'] = self._make_warc_id()
|
req.rec_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||||
|
|
||||||
req.rec_headers['WARC-Target-Uri'] = url
|
req.rec_headers['WARC-Target-URI'] = url
|
||||||
req.rec_headers['WARC-Date'] = dt
|
req.rec_headers['WARC-Date'] = dt
|
||||||
req.rec_headers['WARC-Type'] = 'request'
|
req.rec_headers['WARC-Type'] = 'request'
|
||||||
#req.rec_headers['Content-Type'] = req.content_type
|
#req.rec_headers['Content-Type'] = req.content_type
|
||||||
@ -137,7 +137,7 @@ class BaseWARCWriter(object):
|
|||||||
return record
|
return record
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url = record.rec_headers.get('WARC-Target-Uri')
|
url = record.rec_headers.get('WARC-Target-URI')
|
||||||
digest = record.rec_headers.get('WARC-Payload-Digest')
|
digest = record.rec_headers.get('WARC-Payload-Digest')
|
||||||
iso_dt = record.rec_headers.get('WARC-Date')
|
iso_dt = record.rec_headers.get('WARC-Date')
|
||||||
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
||||||
@ -310,6 +310,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
self._close_file(out)
|
self._close_file(out)
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
def _is_write_resp(self, resp, params):
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _is_write_req(self, req, params):
|
||||||
|
return True
|
||||||
|
|
||||||
def _do_write_req_resp(self, req, resp, params):
|
def _do_write_req_resp(self, req, resp, params):
|
||||||
full_dir = res_template(self.dir_template, params)
|
full_dir = res_template(self.dir_template, params)
|
||||||
|
|
||||||
@ -325,13 +331,16 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
is_new = True
|
is_new = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url = resp.rec_headers.get('WARC-Target-Uri')
|
url = resp.rec_headers.get('WARC-Target-URI')
|
||||||
print('Writing req/resp {0} to {1} '.format(url, filename))
|
print('Writing req/resp {0} to {1} '.format(url, filename))
|
||||||
|
|
||||||
start = out.tell()
|
start = out.tell()
|
||||||
|
|
||||||
self._write_warc_record(out, resp)
|
if self._is_write_resp(resp, params):
|
||||||
self._write_warc_record(out, req)
|
self._write_warc_record(out, resp)
|
||||||
|
|
||||||
|
if self._is_write_req(req, params):
|
||||||
|
self._write_warc_record(out, req)
|
||||||
|
|
||||||
out.flush()
|
out.flush()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user