mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
recorder: cookie filter:
- update ExcludeSpecificHeaders() so it can be passed directly as a filter to warcio
- add ExcludeHttpOnlyCookieHeaders() to exclude a Set-Cookie header only if HttpOnly is present (request Cookie headers are always excluded)
- remove unused code
This commit is contained in:
parent
7a8fed2681
commit
d04f8fc2e3
@ -5,18 +5,31 @@ import re
|
||||
# ============================================================================
|
||||
# Header Exclusions
|
||||
# ============================================================================
|
||||
class ExcludeNone(object):
    """Callable that ignores its argument and always produces ``None``."""

    def __call__(self, record):
        # ``record`` is accepted only to satisfy the call signature;
        # the result never depends on it.
        result = None
        return result
|
||||
class ExcludeSpecificHeaders(object):
    """Per-header filter that drops headers whose name is in a given list.

    Names are compared case-insensitively.  An instance is called with a
    single header, a sequence whose first item is the header name.
    """

    def __init__(self, exclude_headers=None):
        """Store the lower-cased names of the headers to exclude.

        :param exclude_headers: iterable of header names to drop; ``None``
            (the default) or an empty iterable excludes nothing.
        """
        # ``None`` is the declared default, so it must not be iterated
        # directly -- fall back to an empty sequence instead of raising.
        self.exclude_headers = [x.lower() for x in exclude_headers or ()]

    def __call__(self, header):
        """Return ``None`` if the header name is excluded, else the header.

        :param header: sequence whose first element is the header name.
        """
        if header[0].lower() in self.exclude_headers:
            return None

        return header
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ExcludeSpecificHeaders(object):
|
||||
def __init__(self, exclude_headers=[]):
|
||||
self.exclude_headers = [x.lower() for x in exclude_headers]
|
||||
class ExcludeHttpOnlyCookieHeaders(object):
    """Per-header filter for cookie headers.

    Drops every ``Cookie`` header, and drops a ``Set-Cookie`` header only
    when its value carries the ``HttpOnly`` attribute.  All other headers
    are returned unchanged.
    """

    # Matches an ``HttpOnly`` attribute (any case) that follows a ';' and is
    # terminated by another ';' or the end of the header value.
    HTTPONLY_RX = re.compile(';\\s*HttpOnly\\s*(;|$)', re.I)

    # NOTE: a second, earlier ``__call__(self, record)`` that returned
    # ``self.exclude_headers`` used to sit here.  It was shadowed by the
    # definition below and referenced an attribute this class never sets,
    # so it has been dropped.
    def __call__(self, header):
        """Return ``None`` for a header to drop, else the header itself.

        :param header: sequence of ``(name, value)``; the name is matched
            case-insensitively.
        """
        name = header[0].lower()
        if name == 'cookie':
            return None

        if name == 'set-cookie' and self.HTTPONLY_RX.search(header[1]):
            return None

        return header
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -21,8 +21,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
max_idle_secs=1800, *args, **kwargs):
|
||||
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
|
||||
|
||||
self.header_filter = kwargs.get('header_filter')
|
||||
|
||||
if not filename_template:
|
||||
dir_template, filename_template = os.path.split(dir_template)
|
||||
dir_template += os.path.sep
|
||||
@ -64,13 +62,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
|
||||
return record
|
||||
|
||||
def _set_header_buff(self, record):
    """Serialize the record's HTTP headers and cache the bytes on them.

    When ``self.header_filter`` is truthy it is called with the record and
    its result is handed to ``http_headers.to_bytes``; otherwise ``None``
    is passed.  The resulting buffer is stored as
    ``record.http_headers.headers_buff``.
    """
    header_filter = self.header_filter
    exclude_list = header_filter(record) if header_filter else None

    record.http_headers.headers_buff = record.http_headers.to_bytes(exclude_list)
|
||||
|
||||
def get_new_filename(self, dir_, params):
|
||||
timestamp = timestamp20_now()
|
||||
|
||||
|
@ -10,7 +10,6 @@ import six
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
#from requests.structures import CaseInsensitiveDict
|
||||
import requests
|
||||
|
||||
import traceback
|
||||
@ -68,7 +67,6 @@ class RecorderApp(object):
|
||||
|
||||
req_head, req_pay, resp_head, resp_pay, params = result
|
||||
|
||||
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
|
||||
resp_length = resp_pay.tell()
|
||||
resp_pay.seek(0)
|
||||
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
|
||||
@ -238,9 +236,6 @@ class RecorderApp(object):
|
||||
|
||||
resp_iter = StreamIter(resp_stream)
|
||||
|
||||
#if res.headers.get('Transfer-Encoding') == 'chunked':
|
||||
# resp_iter = chunk_encode_iter(resp_iter)
|
||||
|
||||
return resp_iter
|
||||
|
||||
|
||||
|
@ -14,7 +14,7 @@ from fakeredis import FakeStrictRedis
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
from pywb.recorder.redisindexer import WritableRedisIndexer
|
||||
from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter
|
||||
from pywb.recorder.filters import ExcludeSpecificHeaders
|
||||
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
|
||||
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
@ -119,6 +119,19 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
assert stored_req is not None
|
||||
return stored_req, stored_resp
|
||||
|
||||
def _get_http_only_cookies(self, record):
    """Pick out one HttpOnly and one non-HttpOnly ``Set-Cookie`` header.

    Walks ``record.http_headers.headers`` and classifies every
    ``Set-Cookie`` header with ``ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX``.
    When several headers fall in the same category the last one wins.

    :return: ``(non_http_only, http_only)``; either entry is ``None`` if
        no header of that kind was seen.
    """
    # Keyed by "is HttpOnly"; later matches overwrite earlier ones.
    found = {True: None, False: None}

    for header in record.http_headers.headers:
        if header[0].lower() != 'set-cookie':
            continue

        value = header[1].lower()
        is_http_only = bool(ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX.search(value))
        found[is_http_only] = header

    return found[False], found[True]
|
||||
|
||||
def _verify_content_len(self, base_dir, files):
|
||||
for filename in files:
|
||||
filename = os.path.join(base_dir, filename)
|
||||
@ -183,7 +196,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
|
||||
self._test_all_warcs('/warcs/cookiecheck/', 1)
|
||||
|
||||
def test_record_cookies_skip_header(self):
|
||||
def test_record_skip_all_cookies_header(self):
|
||||
warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
|
||||
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
|
||||
recorder_app = RecorderApp(self.upstream_url,
|
||||
@ -208,6 +221,37 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
|
||||
self._test_all_warcs('/warcs/cookieskip/', 1)
|
||||
|
||||
def test_record_skip_http_only_cookies_header(self):
    """Record with ExcludeHttpOnlyCookieHeaders and check what is stored.

    The live response is expected to be a 302 carrying both HttpOnly and
    non-HttpOnly ``Set-Cookie`` headers.  The stored response must keep
    only the non-HttpOnly one, and the stored request must not contain
    the ``Cookie`` header.
    """
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, header_filter=header_filter),
                               accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)

    non_http_only, http_only = self._get_http_only_cookies(record)
    # both httponly and other cookies
    assert http_only is not None
    assert non_http_only is not None

    stored_req, stored_resp = self._load_resp_req(warc_path)

    non_http_only, http_only = self._get_http_only_cookies(stored_resp)
    # no httponly cookies
    assert http_only is None
    assert non_http_only is not None

    # NOTE(review): presumably _test_warc_write sends these request
    # headers -- confirm against that helper.
    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
|
||||
|
||||
def test_record_skip_wrong_coll(self):
|
||||
recorder_app = RecorderApp(self.upstream_url,
|
||||
writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
|
||||
|
Loading…
x
Reference in New Issue
Block a user