1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: add support for Cookie request header rewrite to support sites

which require a cookie to be set. The req_cookie_rewrite directive can be
set in rules.yaml per url prefix with a list of match/replace regexes.
This commit is contained in:
Ilya Kreymer 2015-01-03 12:51:09 -08:00
parent df94e17305
commit d9c5345d3c
5 changed files with 99 additions and 13 deletions

View File

@ -3,6 +3,8 @@ pywb 0.7.2 changelist
* Experiment with disabling DASH for YT
* New ``req_cookie_rewrite`` rewrite directive to rewrite the outgoing ``Cookie`` header; it can be used to force a specific cookie value for a url prefix.
pywb 0.7.1 changelist
~~~~~~~~~~~~~~~~~~~~~
@ -25,7 +27,7 @@ pywb 0.7.1 changelist
- setAttribute override
- Date override sets date to replay timestamp
- Image() object override
- ability to disable dynamic attribute rewriting by setting `_no_rewrite` on an element.
- ability to disable dynamic attribute rewriting by setting ``_no_rewrite`` on an element.
* Type detection: resolve conflict between text/html that is served under js_ mod, resolve if html or js.

View File

@ -50,10 +50,11 @@ class LiveRewriter(object):
return (status_headers, stream)
def translate_headers(self, url, env):
def translate_headers(self, url, urlkey, env):
headers = {}
splits = urlsplit(url)
has_cookies = False
for name, value in env.iteritems():
if name == 'HTTP_HOST':
@ -73,6 +74,11 @@ class LiveRewriter(object):
elif name == 'HTTP_REFERER':
continue
elif name == 'HTTP_COOKIE':
name = 'Cookie'
value = self._req_cookie_rewrite(urlkey, value)
has_cookies = True
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
@ -87,9 +93,28 @@ class LiveRewriter(object):
if value:
headers[name] = value
if not has_cookies:
value = self._req_cookie_rewrite(urlkey, '')
if value:
headers['Cookie'] = value
return headers
def _req_cookie_rewrite(self, urlkey, value):
rule = self.rewriter.ruleset.get_first_match(urlkey)
if not rule or not rule.req_cookie_rewrite:
return value
for cr in rule.req_cookie_rewrite:
try:
value = cr['rx'].sub(cr['replace'], value)
except KeyError:
pass
return value
def fetch_http(self, url,
urlkey=None,
env=None,
req_headers=None,
follow_redirects=False,
@ -109,7 +134,7 @@ class LiveRewriter(object):
method = env['REQUEST_METHOD'].upper()
input_ = env['wsgi.input']
req_headers.update(self.translate_headers(url, env))
req_headers.update(self.translate_headers(url, urlkey, env))
if method in ('POST', 'PUT'):
len_ = env.get('CONTENT_LENGTH')
@ -155,17 +180,18 @@ class LiveRewriter(object):
if url.startswith('//'):
url = 'http:' + url
# explicit urlkey may be passed in (say for testing)
if not urlkey:
urlkey = canonicalize(url)
if is_http(url):
(status_headers, stream) = self.fetch_http(url, env, req_headers,
(status_headers, stream) = self.fetch_http(url, urlkey, env,
req_headers,
follow_redirects,
ignore_proxies)
else:
(status_headers, stream) = self.fetch_local_file(url)
# explicit urlkey may be passed in (say for testing)
if not urlkey:
urlkey = canonicalize(url)
if timestamp is None:
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

View File

@ -8,6 +8,7 @@ from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter
import itertools
import re
#=================================================================
@ -47,6 +48,12 @@ class RewriteRules(BaseRule):
# cookie rewrite scope
self.cookie_scope = config.get('cookie_scope', 'default')
req_cookie_rewrite = config.get('req_cookie_rewrite', [])
for rc in req_cookie_rewrite:
rc['rx'] = re.compile(rc.get('match', ''))
self.req_cookie_rewrite = req_cookie_rewrite
def _add_custom_regexs(self, field, config):
regexs = config.get(field + '_regexs')
if not regexs:

View File

@ -22,10 +22,43 @@ def test_csrf_token_headers():
rewriter = LiveRewriter()
env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}
req_headers = rewriter.translate_headers('http://example.com/', env)
req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env)
assert req_headers == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'}
def test_req_cookie_rewrite_1():
    # A request cookie that does not contain FOO: the translated Cookie
    # header keeps the original pair and gains the FOO value.
    live_rewriter = LiveRewriter()

    translated = live_rewriter.translate_headers('test.example.example/',
                                                 'example,example,test)/',
                                                 {'HTTP_COOKIE': 'A=B'})

    expected = {'Cookie': 'A=B; FOO=&bar=1'}
    assert translated == expected
def test_req_cookie_rewrite_2():
    # A request cookie that already sets FOO: its value is replaced
    # rather than a second FOO being appended.
    live_rewriter = LiveRewriter()

    translated = live_rewriter.translate_headers('test.example.example/',
                                                 'example,example,test)/',
                                                 {'HTTP_COOKIE': 'FOO=goo'})

    expected = {'Cookie': 'FOO=&bar=1'}
    assert translated == expected
def test_req_cookie_rewrite_3():
    # No cookie in the request environ at all: a Cookie header is still
    # produced from the rewrite of an empty value.
    live_rewriter = LiveRewriter()

    translated = live_rewriter.translate_headers('test.example.example/',
                                                 'example,example,test)/',
                                                 {})

    expected = {'Cookie': '; FOO=&bar=1'}
    assert translated == expected
def test_local_1():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,

View File

@ -162,7 +162,7 @@ rules:
args:
- id
- itag
- mime
#- mime
filter:
- '~urlkey:{0}'
@ -187,15 +187,24 @@ rules:
js_rewrite_location: location
# watch config changes
- url_prefix: 'com,youtube)/watch'
# watch and embed config changes
- url_prefix: 'com,youtube)/'
rewrite:
js_regexs:
- match: 'ytplayer.load\(\);'
replace: 'ytplayer.config.args.dash = "0"; ytplayer.config.args.dashmpd = ""; {0}'
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args": {'
replace: '{0} "dash": "0", dashmpd: "", '
req_cookie_rewrite:
- match: '^(((?!PREF).)*)$'
replace: '\1; PREF=f2=40000000'
- match: '(.*PREF=)([^ ;]*)(.*)'
replace: '\1&f2=40000000\3'
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
@ -217,6 +226,15 @@ rules:
rewrite:
js_rewrite_location: urls
req_cookie_rewrite:
- match: '^(((?!FOO).)*)$'
replace: '\1; FOO=bar=1'
- match: '(.*FOO=)([^ ;]*)(.*)'
replace: '\1&bar=1\3'
- match: ''
invalid_: ''
# all domain rules -- fallback to this dataset
#=================================================================