diff --git a/CHANGES.rst b/CHANGES.rst index 61488c7b..de149cf4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,6 +3,8 @@ pywb 0.7.2 changelist * Experiment with disabling DASH for YT +* New ``req_cookie_rewrite`` rewrite directive to rewrite outgoing ``Cookie`` header, can be used to fix a certain cookie for a url prefix. + pywb 0.7.1 changelist ~~~~~~~~~~~~~~~~~~~~~ @@ -25,7 +27,7 @@ pywb 0.7.1 changelist - setAttribute override - Date override sets date to replay timestamp - Image() object override - - ability to disable dynamic attribute rewriting by setting `_no_rewrite` on an element. + - ability to disable dynamic attribute rewriting by setting ``_no_rewrite`` on an element. * Type detection: resolve conflict between text/html that is served under js_ mod, resolve if html or js. diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 1b20e120..3ebbe68d 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -50,10 +50,11 @@ class LiveRewriter(object): return (status_headers, stream) - def translate_headers(self, url, env): + def translate_headers(self, url, urlkey, env): headers = {} splits = urlsplit(url) + has_cookies = False for name, value in env.iteritems(): if name == 'HTTP_HOST': @@ -73,6 +74,11 @@ class LiveRewriter(object): elif name == 'HTTP_REFERER': continue + elif name == 'HTTP_COOKIE': + name = 'Cookie' + value = self._req_cookie_rewrite(urlkey, value) + has_cookies = True + elif name.startswith('HTTP_'): name = name[5:].title().replace('_', '-') @@ -87,9 +93,28 @@ class LiveRewriter(object): if value: headers[name] = value + if not has_cookies: + value = self._req_cookie_rewrite(urlkey, '') + if value: + headers['Cookie'] = value + return headers + def _req_cookie_rewrite(self, urlkey, value): + rule = self.rewriter.ruleset.get_first_match(urlkey) + if not rule or not rule.req_cookie_rewrite: + return value + + for cr in rule.req_cookie_rewrite: + try: + value = cr['rx'].sub(cr['replace'], value) + except KeyError: + pass + + return value + def fetch_http(self, url, + urlkey=None, env=None, req_headers=None, follow_redirects=False, @@ -109,7 +134,7 @@ class LiveRewriter(object): method = env['REQUEST_METHOD'].upper() input_ = env['wsgi.input'] - req_headers.update(self.translate_headers(url, env)) + req_headers.update(self.translate_headers(url, urlkey, env)) if method in ('POST', 'PUT'): len_ = env.get('CONTENT_LENGTH') @@ -155,17 +180,18 @@ class LiveRewriter(object): if url.startswith('//'): url = 'http:' + url + # explicit urlkey may be passed in (say for testing) + if not urlkey: + urlkey = canonicalize(url) + if is_http(url): - (status_headers, stream) = self.fetch_http(url, env, req_headers, + (status_headers, stream) = self.fetch_http(url, urlkey, env, + req_headers, follow_redirects, ignore_proxies) else: (status_headers, stream) = self.fetch_local_file(url) - # explicit urlkey may be passed in (say for testing) - if not urlkey: - urlkey = canonicalize(url) - if timestamp is None: timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 514fb55e..95e2225a 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -8,6 +8,7 @@ from header_rewriter import HeaderRewriter from html_rewriter import HTMLRewriter import itertools +import re #================================================================= @@ -47,6 +48,12 @@ class RewriteRules(BaseRule): # cookie rewrite scope self.cookie_scope = config.get('cookie_scope', 'default') + req_cookie_rewrite = config.get('req_cookie_rewrite', []) + for rc in req_cookie_rewrite: + rc['rx'] = re.compile(rc.get('match', '')) + + self.req_cookie_rewrite = req_cookie_rewrite + def _add_custom_regexs(self, field, config): regexs = config.get(field + '_regexs') if not regexs: diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index fdea7557..99fb6074 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -22,10 +22,43 @@ def test_csrf_token_headers(): rewriter = LiveRewriter() env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'} - req_headers = rewriter.translate_headers('http://example.com/', env) + req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env) assert req_headers == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'} +def test_req_cookie_rewrite_1(): + rewriter = LiveRewriter() + env = {'HTTP_COOKIE': 'A=B'} + + urlkey = 'example,example,test)/' + url = 'test.example.example/' + + req_headers = rewriter.translate_headers(url, urlkey, env) + + assert req_headers == {'Cookie': 'A=B; FOO=&bar=1'} + +def test_req_cookie_rewrite_2(): + rewriter = LiveRewriter() + env = {'HTTP_COOKIE': 'FOO=goo'} + + urlkey = 'example,example,test)/' + url = 'test.example.example/' + + req_headers = rewriter.translate_headers(url, urlkey, env) + + assert req_headers == {'Cookie': 'FOO=&bar=1'} + +def test_req_cookie_rewrite_3(): + rewriter = LiveRewriter() + env = {} + + urlkey = 'example,example,test)/' + url = 'test.example.example/' + + req_headers = rewriter.translate_headers(url, urlkey, env) + + assert req_headers == {'Cookie': '; FOO=&bar=1'} + def test_local_1(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, diff --git a/pywb/rules.yaml b/pywb/rules.yaml index da47efad..cb87a843 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -162,7 +162,7 @@ rules: args: - id - itag - - mime + #- mime filter: - '~urlkey:{0}' @@ -187,15 +187,24 @@ rules: js_rewrite_location: location - # watch config changes - - url_prefix: 'com,youtube)/watch' + # watch and embed config changes + - url_prefix: 'com,youtube)/' rewrite: - js_regexs: - match: 'ytplayer.load\(\);' replace: 'ytplayer.config.args.dash = "0"; ytplayer.config.args.dashmpd = ""; {0}' + - match: 'yt\.setConfig.*PLAYER_CONFIG.*args": {' + replace: '{0} "dash": "0", dashmpd: "", ' + + req_cookie_rewrite: + - match: '^(((?!PREF).)*)$' + replace: '\1; PREF=f2=40000000' + + - match: '(.*PREF=)([^ ;]*)(.*)' + replace: '\1&f2=40000000\3' + # testing rules -- not for valid domain #================================================================= # this rule block is a non-existent prefix merely for testing @@ -217,6 +226,15 @@ rules: rewrite: js_rewrite_location: urls + req_cookie_rewrite: + - match: '^(((?!FOO).)*)$' + replace: '\1; FOO=bar=1' + + - match: '(.*FOO=)([^ ;]*)(.*)' + replace: '\1&bar=1\3' + + - match: '' + invalid_: '' # all domain rules -- fallback to this dataset #=================================================================