1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

filters: more functional filter impl for header exclusion

This commit is contained in:
Ilya Kreymer 2016-03-17 18:22:26 -07:00
parent 58e8c709aa
commit cba8e4ee3a
3 changed files with 29 additions and 7 deletions

17
recorder/filters.py Normal file
View File

@ -0,0 +1,17 @@
# ============================================================================
class ExcludeNone(object):
    """Header filter that excludes nothing.

    Calling an instance with a record always returns ``None``, i.e. it
    supplies no list of header names to exclude.
    """

    def __call__(self, record):
        """Return ``None`` for any ``record``; the argument is not inspected."""
        return None
# ============================================================================
class ExcludeSpecificHeaders(object):
    """Header filter that always excludes a fixed set of header names.

    The names are lower-cased once at construction time; calling the
    instance with any record returns that lower-cased list.
    """

    def __init__(self, exclude_headers=None):
        """Store the lower-cased header names to exclude.

        ``exclude_headers`` is an iterable of header-name strings. ``None``
        (the default) or an empty iterable means no names are stored.
        """
        # A None default avoids sharing one mutable [] across all calls;
        # `or ()` also lets callers pass None explicitly.
        self.exclude_headers = [x.lower() for x in exclude_headers or ()]

    def __call__(self, record):
        """Return the stored list of names; ``record`` is not inspected."""
        return self.exclude_headers

View File

@ -12,6 +12,7 @@ from mock import patch
from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcrecorder import PerRecordWARCRecorder
from recorder.filters import ExcludeSpecificHeaders
from webagg.utils import MementoUtils
@ -128,8 +129,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
def test_record_cookies_skip_header(self):
base_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(base_path, exclude_headers=['Set-Cookie', 'Cookie']),
PerRecordWARCRecorder(base_path, header_filter=header_filter),
accept_colls='live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')

View File

@ -17,6 +17,8 @@ from pywb.utils.bufferedreaders import BufferedReader
from webagg.utils import ParamFormatter
from recorder.filters import ExcludeNone
# ============================================================================
class BaseWARCRecorder(object):
@ -29,14 +31,14 @@ class BaseWARCRecorder(object):
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
BUFF_SIZE = 8192
def __init__(self, gzip=True, dedup_index=None, name='recorder',
exclude_headers=None):
header_filter=ExcludeNone()):
self.gzip = gzip
self.dedup_index = dedup_index
self.rec_source_name = name
self.exclude_headers = exclude_headers
if self.exclude_headers:
self.exclude_headers = [x.lower() for x in self.exclude_headers]
self.header_filter = header_filter
def ensure_digest(self, record):
block_digest = record.rec_headers.get('WARC-Block-Digest')
@ -52,7 +54,7 @@ class BaseWARCRecorder(object):
block_digester.update(record.status_headers.headers_buff)
while True:
buf = record.stream.read(8192)
buf = record.stream.read(self.BUFF_SIZE)
if not buf:
break
@ -67,7 +69,8 @@ class BaseWARCRecorder(object):
return Digester('sha1')
def _set_header_buff(self, record):
buff = record.status_headers.to_bytes(self.exclude_headers)
exclude_list = self.header_filter(record)
buff = record.status_headers.to_bytes(exclude_list)
record.status_headers.headers_buff = buff
def write_req_resp(self, req, resp, params):