1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

recorder warcwriter: allow a subclass to skip writing only the request record or only the response record by overriding _is_write_req and _is_write_resp

(todo: rethink the interface)
This commit is contained in:
Ilya Kreymer 2016-04-15 02:19:34 +00:00
parent a93f75dca2
commit 0b255819ff
2 changed files with 18 additions and 8 deletions

View File

@ -37,7 +37,8 @@ class RecorderApp(object):
self.skip_filters = skip_filters
def create_default_filters(self, kwargs):
@staticmethod
def create_default_filters(kwargs):
skip_filters = [SkipRangeRequestFilter()]
accept_colls = kwargs.get('accept_colls')

View File

@ -44,7 +44,7 @@ class BaseWARCWriter(object):
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
def __init__(self, gzip=True, dedup_index=None, name='recorder',
header_filter=ExcludeNone()):
header_filter=ExcludeNone(), *args, **kwargs):
self.gzip = gzip
self.dedup_index = dedup_index
self.rec_source_name = name
@ -85,13 +85,13 @@ class BaseWARCWriter(object):
record.status_headers.headers_buff = buff
def write_req_resp(self, req, resp, params):
url = resp.rec_headers.get('WARC-Target-Uri')
url = resp.rec_headers.get('WARC-Target-URI')
dt = resp.rec_headers.get('WARC-Date')
if not req.rec_headers.get('WARC-Record-ID'):
req.rec_headers['WARC-Record-ID'] = self._make_warc_id()
req.rec_headers['WARC-Target-Uri'] = url
req.rec_headers['WARC-Target-URI'] = url
req.rec_headers['WARC-Date'] = dt
req.rec_headers['WARC-Type'] = 'request'
#req.rec_headers['Content-Type'] = req.content_type
@ -137,7 +137,7 @@ class BaseWARCWriter(object):
return record
try:
url = record.rec_headers.get('WARC-Target-Uri')
url = record.rec_headers.get('WARC-Target-URI')
digest = record.rec_headers.get('WARC-Payload-Digest')
iso_dt = record.rec_headers.get('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
@ -310,6 +310,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
self._close_file(out)
return filename
def _is_write_resp(self, resp, params):
return True
def _is_write_req(self, req, params):
return True
def _do_write_req_resp(self, req, resp, params):
full_dir = res_template(self.dir_template, params)
@ -325,13 +331,16 @@ class MultiFileWARCWriter(BaseWARCWriter):
is_new = True
try:
url = resp.rec_headers.get('WARC-Target-Uri')
url = resp.rec_headers.get('WARC-Target-URI')
print('Writing req/resp {0} to {1} '.format(url, filename))
start = out.tell()
self._write_warc_record(out, resp)
self._write_warc_record(out, req)
if self._is_write_resp(resp, params):
self._write_warc_record(out, resp)
if self._is_write_req(req, params):
self._write_warc_record(out, req)
out.flush()