From cba8e4ee3a6ba82c9c80efbbfcb00cdc36c2f3b0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 17 Mar 2016 18:22:26 -0700 Subject: [PATCH] filters: more functional filter impl for header exclusion --- recorder/filters.py | 17 +++++++++++++++++ recorder/test/test_recorder.py | 4 +++- recorder/warcrecorder.py | 15 +++++++++------ 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 recorder/filters.py diff --git a/recorder/filters.py b/recorder/filters.py new file mode 100644 index 00000000..809822d4 --- /dev/null +++ b/recorder/filters.py @@ -0,0 +1,17 @@ + + +# ============================================================================ +class ExcludeNone(object): + def __call__(self, record): + return None + + +# ============================================================================ +class ExcludeSpecificHeaders(object): + def __init__(self, exclude_headers=[]): + self.exclude_headers = [x.lower() for x in exclude_headers] + + def __call__(self, record): + return self.exclude_headers + + diff --git a/recorder/test/test_recorder.py b/recorder/test/test_recorder.py index f5dbdaf7..7838830b 100644 --- a/recorder/test/test_recorder.py +++ b/recorder/test/test_recorder.py @@ -12,6 +12,7 @@ from mock import patch from recorder.recorderapp import RecorderApp from recorder.redisindexer import WritableRedisIndexer from recorder.warcrecorder import PerRecordWARCRecorder +from recorder.filters import ExcludeSpecificHeaders from webagg.utils import MementoUtils @@ -128,8 +129,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass): def test_record_cookies_skip_header(self): base_path = to_path(self.root_dir + '/warcs/cookieskip/') + header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']) recorder_app = RecorderApp(self.upstream_url, - PerRecordWARCRecorder(base_path, exclude_headers=['Set-Cookie', 'Cookie']), + PerRecordWARCRecorder(base_path, header_filter=header_filter), accept_colls='live') resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') diff --git a/recorder/warcrecorder.py b/recorder/warcrecorder.py index e75bff05..a17cd670 100644 --- a/recorder/warcrecorder.py +++ b/recorder/warcrecorder.py @@ -17,6 +17,8 @@ from pywb.utils.bufferedreaders import BufferedReader from webagg.utils import ParamFormatter +from recorder.filters import ExcludeNone + # ============================================================================ class BaseWARCRecorder(object): @@ -29,14 +31,14 @@ class BaseWARCRecorder(object): REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest' + BUFF_SIZE = 8192 + def __init__(self, gzip=True, dedup_index=None, name='recorder', - exclude_headers=None): + header_filter=ExcludeNone()): self.gzip = gzip self.dedup_index = dedup_index self.rec_source_name = name - self.exclude_headers = exclude_headers - if self.exclude_headers: - self.exclude_headers = [x.lower() for x in self.exclude_headers] + self.header_filter = header_filter def ensure_digest(self, record): block_digest = record.rec_headers.get('WARC-Block-Digest') @@ -52,7 +54,7 @@ class BaseWARCRecorder(object): block_digester.update(record.status_headers.headers_buff) while True: - buf = record.stream.read(8192) + buf = record.stream.read(self.BUFF_SIZE) if not buf: break @@ -67,7 +69,8 @@ class BaseWARCRecorder(object): return Digester('sha1') def _set_header_buff(self, record): - buff = record.status_headers.to_bytes(self.exclude_headers) + exclude_list = self.header_filter(record) + buff = record.status_headers.to_bytes(exclude_list) record.status_headers.headers_buff = buff def write_req_resp(self, req, resp, params):