diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py
index 1d9c68e1..b9ccd540 100644
--- a/pywb/recorder/filters.py
+++ b/pywb/recorder/filters.py
@@ -5,18 +5,37 @@ import re
 # ============================================================================
 # Header Exclusions
 # ============================================================================
-class ExcludeNone(object):
-    def __call__(self, record):
-        return None
+class ExcludeSpecificHeaders(object):
+    """Filter returning None for headers named in exclude_headers
+    (case-insensitive match on header[0]); other headers pass through."""
+    def __init__(self, exclude_headers=None):
+        # None (the default) means "exclude nothing" rather than a TypeError
+        self.exclude_headers = [x.lower() for x in (exclude_headers or ())]
+
+    def __call__(self, header):
+        if header[0].lower() in self.exclude_headers:
+            return None
+
+        return header
 
 
 # ============================================================================
-class ExcludeSpecificHeaders(object):
-    def __init__(self, exclude_headers=[]):
-        self.exclude_headers = [x.lower() for x in exclude_headers]
+class ExcludeHttpOnlyCookieHeaders(object):
+    """Filter returning None for every Cookie header and for Set-Cookie
+    headers whose value carries the HttpOnly attribute; other headers
+    pass through."""
+    HTTPONLY_RX = re.compile(';\\s*HttpOnly\\s*(;|$)', re.I)
 
-    def __call__(self, record):
-        return self.exclude_headers
+    def __call__(self, header):
+        name = header[0].lower()
+        if name == 'cookie':
+            return None
+
+        if (name == 'set-cookie' and
+                self.HTTPONLY_RX.search(header[1])):
+            return None
+
+        return header
 
 
 # ============================================================================
diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py
index 475dce01..9c0396ef 100644
--- a/pywb/recorder/multifilewarcwriter.py
+++ b/pywb/recorder/multifilewarcwriter.py
@@ -21,8 +21,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
                  max_idle_secs=1800, *args, **kwargs):
         super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
 
-        self.header_filter = kwargs.get('header_filter')
-
         if not filename_template:
             dir_template, filename_template = os.path.split(dir_template)
             dir_template += os.path.sep
@@ -64,13 +62,6 @@ class MultiFileWARCWriter(BaseWARCWriter):
 
         return record
 
-    def _set_header_buff(self, record):
-        exclude_list = None
-        if self.header_filter:
-            exclude_list = self.header_filter(record)
-        buff = record.http_headers.to_bytes(exclude_list)
-        record.http_headers.headers_buff = buff
-
     def get_new_filename(self, dir_, params):
         timestamp = timestamp20_now()
 
diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py
index 1d3a6992..8641fc61 100644
--- a/pywb/recorder/recorderapp.py
+++ b/pywb/recorder/recorderapp.py
@@ -10,7 +10,6 @@ import six
 import json
 import tempfile
 
-#from requests.structures import CaseInsensitiveDict
 import requests
 
 import traceback
@@ -68,7 +67,6 @@ class RecorderApp(object):
 
         req_head, req_pay, resp_head, resp_pay, params = result
 
-        #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
         resp_length = resp_pay.tell()
         resp_pay.seek(0)
         resp = self.writer.create_record_from_stream(resp_pay, resp_length)
@@ -238,9 +236,6 @@ class RecorderApp(object):
 
         resp_iter = StreamIter(resp_stream)
 
-        #if res.headers.get('Transfer-Encoding') == 'chunked':
-        #    resp_iter = chunk_encode_iter(resp_iter)
-
         return resp_iter
 
 
diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py
index 4efac766..4bf62e1e 100644
--- a/pywb/recorder/test/test_recorder.py
+++ b/pywb/recorder/test/test_recorder.py
@@ -14,7 +14,7 @@ from fakeredis import FakeStrictRedis
 from pywb.recorder.recorderapp import RecorderApp
 from pywb.recorder.redisindexer import WritableRedisIndexer
 from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter
-from pywb.recorder.filters import ExcludeSpecificHeaders
+from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
 from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
 
 from pywb.webagg.utils import MementoUtils
@@ -119,6 +119,19 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
         assert stored_req is not None
         return stored_req, stored_resp
 
+    def _get_http_only_cookies(self, record):
+        non_http_only = None
+        http_only = None
+        for header in record.http_headers.headers:
+            name = header[0].lower()
+            if name == 'set-cookie':
+                if ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX.search(header[1].lower()):
+                    http_only = header
+                else:
+                    non_http_only = header
+
+        return non_http_only, http_only
+
     def _verify_content_len(self, base_dir, files):
         for filename in files:
             filename = os.path.join(base_dir, filename)
@@ -183,7 +196,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
 
         self._test_all_warcs('/warcs/cookiecheck/', 1)
 
-    def test_record_cookies_skip_header(self):
+    def test_record_skip_all_cookies_header(self):
         warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
         header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
         recorder_app = RecorderApp(self.upstream_url,
@@ -208,6 +221,37 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
 
         self._test_all_warcs('/warcs/cookieskip/', 1)
 
+    def test_record_skip_http_only_cookies_header(self):
+        warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
+        header_filter = ExcludeHttpOnlyCookieHeaders()
+        recorder_app = RecorderApp(self.upstream_url,
+                                   PerRecordWARCWriter(warc_path, header_filter=header_filter),
+                                   accept_colls='live')
+
+        resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
+        assert b'HTTP/1.1 302' in resp.body
+
+        buff = BytesIO(resp.body)
+        record = ArcWarcRecordLoader().parse_record_stream(buff)
+
+        non_http_only, http_only = self._get_http_only_cookies(record)
+        # both httponly and other cookies
+        assert http_only is not None
+        assert non_http_only is not None
+
+        stored_req, stored_resp = self._load_resp_req(warc_path)
+
+        non_http_only, http_only = self._get_http_only_cookies(stored_resp)
+        # no httponly cookies
+        assert http_only is None
+        assert non_http_only is not None
+
+
+        assert ('X-Other', 'foo') in stored_req.http_headers.headers
+        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers
+
+        self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
+
     def test_record_skip_wrong_coll(self):
         recorder_app = RecorderApp(self.upstream_url,
                         writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')