1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

filters: more functional filter impl for header exclusion

This commit is contained in:
Ilya Kreymer 2016-03-17 18:22:26 -07:00
parent 58e8c709aa
commit cba8e4ee3a
3 changed files with 29 additions and 7 deletions

17
recorder/filters.py Normal file
View File

@ -0,0 +1,17 @@
# ============================================================================
class ExcludeNone(object):
    """Header filter that never supplies an exclude list.

    Instances are callables taking a record; the call always
    evaluates to ``None`` regardless of the record passed in.
    """

    def __call__(self, record):
        """Return ``None`` for any *record* (the argument is not inspected)."""
        return None
# ============================================================================
class ExcludeSpecificHeaders(object):
    """Header filter that always yields a fixed list of header names.

    The names given at construction time are lower-cased once and the
    resulting list is returned for every record the filter is called with.
    """

    def __init__(self, exclude_headers=None):
        """Store the lower-cased header names.

        :param exclude_headers: iterable of header-name strings; ``None``
            (the default) or an empty iterable results in an empty list.
        """
        # ``None`` instead of a mutable ``[]`` default: avoids sharing one
        # list object across calls and lets callers pass ``None`` explicitly.
        if exclude_headers is None:
            exclude_headers = ()
        self.exclude_headers = [name.lower() for name in exclude_headers]

    def __call__(self, record):
        """Return the stored lower-cased header names; *record* is unused."""
        return self.exclude_headers

View File

@ -12,6 +12,7 @@ from mock import patch
from recorder.recorderapp import RecorderApp from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer from recorder.redisindexer import WritableRedisIndexer
from recorder.warcrecorder import PerRecordWARCRecorder from recorder.warcrecorder import PerRecordWARCRecorder
from recorder.filters import ExcludeSpecificHeaders
from webagg.utils import MementoUtils from webagg.utils import MementoUtils
@ -128,8 +129,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
def test_record_cookies_skip_header(self): def test_record_cookies_skip_header(self):
base_path = to_path(self.root_dir + '/warcs/cookieskip/') base_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(base_path, exclude_headers=['Set-Cookie', 'Cookie']), PerRecordWARCRecorder(base_path, header_filter=header_filter),
accept_colls='live') accept_colls='live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')

View File

@ -17,6 +17,8 @@ from pywb.utils.bufferedreaders import BufferedReader
from webagg.utils import ParamFormatter from webagg.utils import ParamFormatter
from recorder.filters import ExcludeNone
# ============================================================================ # ============================================================================
class BaseWARCRecorder(object): class BaseWARCRecorder(object):
@ -29,14 +31,14 @@ class BaseWARCRecorder(object):
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest' REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
BUFF_SIZE = 8192
def __init__(self, gzip=True, dedup_index=None, name='recorder', def __init__(self, gzip=True, dedup_index=None, name='recorder',
exclude_headers=None): header_filter=ExcludeNone()):
self.gzip = gzip self.gzip = gzip
self.dedup_index = dedup_index self.dedup_index = dedup_index
self.rec_source_name = name self.rec_source_name = name
self.exclude_headers = exclude_headers self.header_filter = header_filter
if self.exclude_headers:
self.exclude_headers = [x.lower() for x in self.exclude_headers]
def ensure_digest(self, record): def ensure_digest(self, record):
block_digest = record.rec_headers.get('WARC-Block-Digest') block_digest = record.rec_headers.get('WARC-Block-Digest')
@ -52,7 +54,7 @@ class BaseWARCRecorder(object):
block_digester.update(record.status_headers.headers_buff) block_digester.update(record.status_headers.headers_buff)
while True: while True:
buf = record.stream.read(8192) buf = record.stream.read(self.BUFF_SIZE)
if not buf: if not buf:
break break
@ -67,7 +69,8 @@ class BaseWARCRecorder(object):
return Digester('sha1') return Digester('sha1')
def _set_header_buff(self, record): def _set_header_buff(self, record):
buff = record.status_headers.to_bytes(self.exclude_headers) exclude_list = self.header_filter(record)
buff = record.status_headers.to_bytes(exclude_list)
record.status_headers.headers_buff = buff record.status_headers.headers_buff = buff
def write_req_resp(self, req, resp, params): def write_req_resp(self, req, resp, params):