mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
filters: more functional filter impl for header exclusion
This commit is contained in:
parent
58e8c709aa
commit
cba8e4ee3a
17
recorder/filters.py
Normal file
17
recorder/filters.py
Normal file
@ -0,0 +1,17 @@
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ExcludeNone(object):
    """Header filter that never excludes any headers.

    Instances are callables taking a record and returning the list of
    header names to exclude when the record's headers are serialized.
    This filter always returns ``None``, i.e. it supplies no exclusion
    list at all.
    """

    def __call__(self, record):
        """Return ``None`` for every record.

        :param record: the record being written; not inspected.
        :return: always ``None``.
        """
        return None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ExcludeSpecificHeaders(object):
    """Header filter that excludes a fixed set of header names.

    The names are lower-cased once at construction time, and the same
    list is returned for every record.
    """

    def __init__(self, exclude_headers=None):
        """Store the lower-cased header names to exclude.

        :param exclude_headers: iterable of header-name strings, or
            ``None`` / omitted for an empty exclusion list.
        """
        # A None default (instead of ``[]``) avoids sharing one mutable
        # default object across all calls.
        if exclude_headers is None:
            exclude_headers = ()

        self.exclude_headers = [x.lower() for x in exclude_headers]

    def __call__(self, record):
        """Return the list of lower-cased header names to exclude.

        :param record: the record being written; not inspected.
        :return: the list built in ``__init__`` (same object each call).
        """
        return self.exclude_headers
|
||||
|
||||
|
@ -12,6 +12,7 @@ from mock import patch
|
||||
from recorder.recorderapp import RecorderApp
|
||||
from recorder.redisindexer import WritableRedisIndexer
|
||||
from recorder.warcrecorder import PerRecordWARCRecorder
|
||||
from recorder.filters import ExcludeSpecificHeaders
|
||||
|
||||
from webagg.utils import MementoUtils
|
||||
|
||||
@ -128,8 +129,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
||||
|
||||
def test_record_cookies_skip_header(self):
|
||||
base_path = to_path(self.root_dir + '/warcs/cookieskip/')
|
||||
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
|
||||
recorder_app = RecorderApp(self.upstream_url,
|
||||
PerRecordWARCRecorder(base_path, exclude_headers=['Set-Cookie', 'Cookie']),
|
||||
PerRecordWARCRecorder(base_path, header_filter=header_filter),
|
||||
accept_colls='live')
|
||||
|
||||
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
|
||||
|
@ -17,6 +17,8 @@ from pywb.utils.bufferedreaders import BufferedReader
|
||||
|
||||
from webagg.utils import ParamFormatter
|
||||
|
||||
from recorder.filters import ExcludeNone
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseWARCRecorder(object):
|
||||
@ -29,14 +31,14 @@ class BaseWARCRecorder(object):
|
||||
|
||||
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
|
||||
|
||||
BUFF_SIZE = 8192
|
||||
|
||||
def __init__(self, gzip=True, dedup_index=None, name='recorder',
|
||||
exclude_headers=None):
|
||||
header_filter=ExcludeNone()):
|
||||
self.gzip = gzip
|
||||
self.dedup_index = dedup_index
|
||||
self.rec_source_name = name
|
||||
self.exclude_headers = exclude_headers
|
||||
if self.exclude_headers:
|
||||
self.exclude_headers = [x.lower() for x in self.exclude_headers]
|
||||
self.header_filter = header_filter
|
||||
|
||||
def ensure_digest(self, record):
|
||||
block_digest = record.rec_headers.get('WARC-Block-Digest')
|
||||
@ -52,7 +54,7 @@ class BaseWARCRecorder(object):
|
||||
block_digester.update(record.status_headers.headers_buff)
|
||||
|
||||
while True:
|
||||
buf = record.stream.read(8192)
|
||||
buf = record.stream.read(self.BUFF_SIZE)
|
||||
if not buf:
|
||||
break
|
||||
|
||||
@ -67,7 +69,8 @@ class BaseWARCRecorder(object):
|
||||
return Digester('sha1')
|
||||
|
||||
def _set_header_buff(self, record):
|
||||
buff = record.status_headers.to_bytes(self.exclude_headers)
|
||||
exclude_list = self.header_filter(record)
|
||||
buff = record.status_headers.to_bytes(exclude_list)
|
||||
record.status_headers.headers_buff = buff
|
||||
|
||||
def write_req_resp(self, req, resp, params):
|
||||
|
Loading…
x
Reference in New Issue
Block a user