1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

recorder: cookie filter:

- update ExcludeSpecificHeaders() so that it can be passed directly as a header filter to warcio
- add ExcludeHttpOnlyCookieHeaders() to exclude Cookie headers, and to exclude Set-Cookie headers only if HttpOnly is present
- remove unused code
This commit is contained in:
Ilya Kreymer 2017-03-10 10:07:13 -08:00
parent 7a8fed2681
commit d04f8fc2e3
4 changed files with 67 additions and 24 deletions

View File

@ -5,18 +5,31 @@ import re
# ============================================================================
# Header Exclusions
# ============================================================================
class ExcludeNone(object):
    """Filter callable that produces ``None`` for every record it receives."""

    def __call__(self, record):
        # The record is ignored entirely; the result is always None.
        # NOTE(review): presumably None means "no headers to exclude" for a
        # caller that uses the result as an exclusion list -- confirm.
        return
class ExcludeSpecificHeaders(object):
    """Per-header filter that drops headers whose name is in a fixed list.

    An instance is called with a single header, indexed as
    ``header[0]`` (name) and ``header[1]`` (value), and returns either the
    header unchanged or ``None`` when the header should be excluded.

    :param exclude_headers: iterable of header names to exclude; names are
        compared case-insensitively. ``None`` (the default) excludes nothing.
    """

    def __init__(self, exclude_headers=None):
        # The signature advertises None as the default, so it must be
        # accepted: iterate over an empty tuple instead of over None.
        self.exclude_headers = [x.lower() for x in exclude_headers or ()]

    def __call__(self, header):
        """Return ``None`` if ``header`` is excluded, else ``header`` itself."""
        if header[0].lower() in self.exclude_headers:
            return None

        return header
# ============================================================================
class ExcludeSpecificHeaders(object):
    """Holds a lower-cased list of header names that should be excluded.

    :param exclude_headers: iterable of header names; ``None`` (the default)
        results in an empty list.
    """
    # NOTE(review): a class with this same name also appears earlier in this
    # view; presumably this one is the pre-change version shown by the diff
    # -- confirm before relying on either definition.

    def __init__(self, exclude_headers=None):
        # A None sentinel replaces the former mutable ``[]`` default, so no
        # list object is shared between calls.
        self.exclude_headers = [x.lower() for x in exclude_headers or ()]
class ExcludeHttpOnlyCookieHeaders(object):
    """Per-header filter that drops request cookies and HttpOnly response cookies.

    An instance is called with a single header, indexed as ``header[0]``
    (name) and ``header[1]`` (value). It returns ``None`` when the header
    should be excluded and the header itself otherwise:

    - ``Cookie`` headers are always excluded.
    - ``Set-Cookie`` headers are excluded only when the value carries an
      ``HttpOnly`` attribute.
    - Every other header is returned unchanged.

    Header names are compared case-insensitively.
    """

    # Matches an ``HttpOnly`` attribute: it must follow a ';' and be followed
    # by another ';' or the end of the value. Matching is case-insensitive.
    HTTPONLY_RX = re.compile(';\\s*HttpOnly\\s*(;|$)', re.I)

    # The former record-based ``__call__`` returning ``self.exclude_headers``
    # is dropped: it was shadowed by the definition below and read an
    # attribute this class never sets.
    def __call__(self, header):
        """Return ``None`` if ``header`` is excluded, else ``header`` itself."""
        name = header[0].lower()
        if name == 'cookie':
            return None

        if (name == 'set-cookie' and
                self.HTTPONLY_RX.search(header[1])):
            return None

        return header
# ============================================================================

View File

@ -21,8 +21,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
self.header_filter = kwargs.get('header_filter')
if not filename_template:
dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep
@ -64,13 +62,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
return record
def _set_header_buff(self, record):
    """Serialize the record's HTTP headers and cache the bytes on the record.

    When ``self.header_filter`` is set, it is called with ``record`` and its
    result is handed to ``record.http_headers.to_bytes`` as the exclusion
    list; otherwise ``None`` is passed. The serialized result is stored in
    ``record.http_headers.headers_buff``.
    """
    excluded = self.header_filter(record) if self.header_filter else None
    http_headers = record.http_headers
    http_headers.headers_buff = http_headers.to_bytes(excluded)
def get_new_filename(self, dir_, params):
timestamp = timestamp20_now()

View File

@ -10,7 +10,6 @@ import six
import json
import tempfile
#from requests.structures import CaseInsensitiveDict
import requests
import traceback
@ -68,7 +67,6 @@ class RecorderApp(object):
req_head, req_pay, resp_head, resp_pay, params = result
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
resp_length = resp_pay.tell()
resp_pay.seek(0)
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
@ -238,9 +236,6 @@ class RecorderApp(object):
resp_iter = StreamIter(resp_stream)
#if res.headers.get('Transfer-Encoding') == 'chunked':
# resp_iter = chunk_encode_iter(resp_iter)
return resp_iter

View File

@ -14,7 +14,7 @@ from fakeredis import FakeStrictRedis
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter
from pywb.recorder.filters import ExcludeSpecificHeaders
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.webagg.utils import MementoUtils
@ -119,6 +119,19 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert stored_req is not None
return stored_req, stored_resp
def _get_http_only_cookies(self, record):
    """Pick out the Set-Cookie headers of ``record`` by HttpOnly status.

    Walks ``record.http_headers.headers`` and returns a
    ``(non_http_only, http_only)`` pair. Each element is the last
    ``Set-Cookie`` header of that kind that was seen, or ``None`` if there
    was none. A header counts as HttpOnly when its lower-cased value matches
    ``ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX``.
    """
    plain_cookie = None
    flagged_cookie = None

    for entry in record.http_headers.headers:
        # Only Set-Cookie headers are of interest; skip everything else.
        if entry[0].lower() != 'set-cookie':
            continue

        value = entry[1].lower()
        if ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX.search(value):
            flagged_cookie = entry
        else:
            plain_cookie = entry

    return plain_cookie, flagged_cookie
def _verify_content_len(self, base_dir, files):
for filename in files:
filename = os.path.join(base_dir, filename)
@ -183,7 +196,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/cookiecheck/', 1)
def test_record_cookies_skip_header(self):
def test_record_skip_all_cookies_header(self):
warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url,
@ -208,6 +221,37 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/cookieskip/', 1)
def test_record_skip_http_only_cookies_header(self):
    """Check that ExcludeHttpOnlyCookieHeaders filters what is written to the WARC.

    The live response must contain both an HttpOnly and a non-HttpOnly
    Set-Cookie header, while the stored response must keep only the
    non-HttpOnly one. The stored request must still contain ``X-Other`` but
    not ``Cookie``.
    """
    # NOTE(review): presumably _test_warc_write sends the 'X-Other: foo' and
    # 'Cookie: boo=far' request headers asserted on below -- confirm.
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, header_filter=header_filter),
                               accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)

    non_http_only, http_only = self._get_http_only_cookies(record)

    # both httponly and other cookies
    # (identity checks: comparisons against None use ``is`` / ``is not``)
    assert http_only is not None
    assert non_http_only is not None

    stored_req, stored_resp = self._load_resp_req(warc_path)

    non_http_only, http_only = self._get_http_only_cookies(stored_resp)

    # no httponly cookies
    assert http_only is None
    assert non_http_only is not None

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
def test_record_skip_wrong_coll(self):
recorder_app = RecorderApp(self.upstream_url,
writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')