1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Cookie rewriter work: add the ability to set a custom 'root scope' rewriter,
which sets the path of all cookies to the pywb root.

The option can be enabled per url-prefix in the rules; more testing is still
needed, and other options are needed.
This commit is contained in:
Ilya Kreymer 2014-09-30 12:42:11 -07:00
parent 7feb0893eb
commit f1b3f8c76f
9 changed files with 107 additions and 49 deletions

View File

@ -71,7 +71,8 @@ class WbRequest(object):
self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
host_prefix + rel_prefix,
rel_prefix)
rel_prefix,
env.get('SCRIPT_NAME', '/'))
else:
# no wb_url, just store blank wb_url
self.wb_url = None
@ -96,9 +97,6 @@ class WbRequest(object):
if value and value.lower() == 'xmlhttprequest':
return True
#if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
# return True
return False
def __repr__(self):

View File

@ -2,10 +2,8 @@ from Cookie import SimpleCookie, CookieError
#=================================================================
class WbUrlCookieRewriter(object):
""" Cookie rewriter for wburl-based requests
Remove the domain and rewrite path, if any, to match
given WbUrl using the url rewriter.
class WbUrlBaseCookieRewriter(object):
""" Base Cookie rewriter for wburl-based requests.
"""
def __init__(self, url_rewriter):
self.url_rewriter = url_rewriter
@ -19,21 +17,68 @@ class WbUrlCookieRewriter(object):
return results
for name, morsel in cookie.iteritems():
# if domain set, no choice but to expand cookie path to root
if morsel.get('domain'):
del morsel['domain']
morsel['path'] = self.url_rewriter.rel_prefix
# else set cookie to rewritten path
elif morsel.get('path'):
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
# remove expires as it refers to archived time
if morsel.get('expires'):
del morsel['expires']
# don't use max-age, just expire at end of session
if morsel.get('max-age'):
del morsel['max-age']
results.append((header, morsel.OutputString()))
morsel = self.rewrite_cookie(name, morsel)
if morsel:
results.append((header, morsel.OutputString()))
return results
#=================================================================
class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Rewrite each cookie to the narrowest scope possible.

    A cookie that carries a domain has the domain removed and its path
    set to the url rewriter's rel_prefix. Otherwise, a cookie that
    carries a path has that path passed through the url rewriter.
    The 'expires' and 'max-age' attributes are dropped in all cases.
    """
    def rewrite_cookie(self, name, morsel):
        """
        Adjust the attributes of a single cookie morsel in place.

        :param name: cookie name (not used by this rewriter)
        :param morsel: the cookie morsel to modify
        :return: the same morsel, after modification
        """
        if morsel.get('domain'):
            # with a domain present there is no choice but to widen
            # the cookie to the path prefix
            del morsel['domain']
            morsel['path'] = self.url_rewriter.rel_prefix
        else:
            cookie_path = morsel.get('path')
            if cookie_path:
                morsel['path'] = self.url_rewriter.rewrite(cookie_path)

        # 'expires' refers to archived time; 'max-age' is dropped so the
        # cookie just expires at the end of the session
        for age_attr in ('expires', 'max-age'):
            if morsel.get(age_attr):
                del morsel[age_attr]

        return morsel
#=================================================================
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Sometimes it is necessary to rewrite cookies to root scope
    in order to work across time boundaries and modifiers.

    This rewriter sets the path of every cookie to the url rewriter's
    root_path, removes any domain, and removes the 'expires' and
    'max-age' attributes so the cookie lasts only for the session.
    """
    def rewrite_cookie(self, name, morsel):
        """
        Move a single cookie morsel to the root path, in place.

        :param name: cookie name (not used by this rewriter)
        :param morsel: the cookie morsel to modify
        :return: the same morsel, after modification
        """
        # every cookie goes to the root path, whatever path it had
        morsel['path'] = self.url_rewriter.root_path

        # remove domain
        if morsel.get('domain'):
            del morsel['domain']

        # remove expires as it refers to archived time
        if morsel.get('expires'):
            del morsel['expires']

        # don't use max-age either, just expire at end of session
        # (keeps this rewriter consistent with MinimalScopeCookieRewriter)
        if morsel.get('max-age'):
            del morsel['max-age']

        return morsel
#=================================================================
def get_cookie_rewriter(rule):
    """
    Select the cookie rewriter class for a rewrite rule.

    :param rule: a rule object exposing ``cookie_scope``, or a falsy
        value when no rule applies
    :return: RootScopeCookieRewriter when the rule's cookie_scope is
        'root', otherwise MinimalScopeCookieRewriter (the class itself,
        not an instance)
    """
    use_root_scope = bool(rule) and rule.cookie_scope == 'root'
    return (RootScopeCookieRewriter if use_root_scope
            else MinimalScopeCookieRewriter)

View File

@ -49,7 +49,7 @@ class HeaderRewriter:
def __init__(self, header_prefix='X-Archive-Orig-'):
self.header_prefix = header_prefix
def rewrite(self, status_headers, urlrewriter):
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
content_type = status_headers.get_header('Content-Type')
text_type = None
charset = None
@ -63,6 +63,7 @@ class HeaderRewriter:
result = self._rewrite_headers(status_headers.headers,
urlrewriter,
cookie_rewriter,
strip_encoding)
new_headers = result[0]
@ -89,15 +90,12 @@ class HeaderRewriter:
return content_type[idx + len(CHARSET_TOKEN):].lower()
def _rewrite_headers(self, headers, urlrewriter, content_rewritten=False):
def _rewrite_headers(self, headers, urlrewriter, cookie_rewriter,
content_rewritten):
new_headers = []
removed_header_dict = {}
if urlrewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter()
else:
cookie_rewriter = None
for (name, value) in headers:
lowername = name.lower()

View File

@ -37,13 +37,17 @@ class RewriteContent:
return (status_headers, stream)
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''):
header_rewriter_class = (self.ruleset.get_first_match(urlkey).
rewriters['header'])
header_rewriter_class = rule.rewriters['header']
cookie_rewriter = None
if urlrewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
rewritten_headers = (header_rewriter_class().
rewrite(status_headers, urlrewriter))
rewrite(status_headers, urlrewriter, cookie_rewriter))
# note: since chunk encoding may/may not be valid,
# the approach taken here is to *always* attempt
@ -74,9 +78,12 @@ class RewriteContent:
if wb_url.is_banner_only:
urlrewriter = None
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
headers,
stream)
rule = self.ruleset.get_first_match(urlkey)
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
rule,
headers,
stream)
status_headers = rewritten_headers.status_headers
@ -112,8 +119,6 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
rule = self.ruleset.get_first_match(urlkey)
rewriter_class = rule.rewriters[text_type]
# for html, need to perform header insert, supply js, css, xml

View File

@ -42,6 +42,9 @@ class RewriteRules(BaseRule):
# add any regexs for js rewriter
self._add_custom_regexs('js', config)
# cookie rewrite scope
self.cookie_scope = config.get('cookie_scope', 'default')
def _add_custom_regexs(self, field, config):
regexs = config.get(field + '_regexs')
if not regexs:

View File

@ -26,7 +26,7 @@ r"""
"""
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
@ -35,5 +35,5 @@ urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
return WbUrlCookieRewriter(rewriter).rewrite(cookie_str)
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)

View File

@ -71,7 +71,7 @@ urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
headerrewriter = HeaderRewriter()
def _test_headers(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter())
return pprint.pprint(vars(rewritten))

View File

@ -2,7 +2,7 @@ import copy
import urlparse
from wburl import WbUrl
from cookie_rewriter import WbUrlCookieRewriter
from cookie_rewriter import get_cookie_rewriter
#=================================================================
@ -18,11 +18,12 @@ class UrlRewriter(object):
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None):
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix
self.rel_prefix = rel_prefix if rel_prefix else prefix
self.root_path = root_path if root_path else '/'
def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all
@ -83,8 +84,9 @@ class UrlRewriter(object):
new_wburl = WbUrl(new_url)
return UrlRewriter(new_wburl, self.prefix)
def get_cookie_rewriter(self):
return WbUrlCookieRewriter(self)
def get_cookie_rewriter(self, rule=None):
cls = get_cookie_rewriter(rule)
return cls(self)
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@ -149,5 +151,5 @@ class HttpsUrlRewriter(UrlRewriter):
def rebase_rewriter(self, new_url):
return self
def get_cookie_rewriter(self):
def get_cookie_rewriter(self, rule=None):
return None

View File

@ -46,7 +46,14 @@ rules:
parse_comments: true
# flickr rules
# instagram rules
#=================================================================
- url_prefix: 'com,instagram'
rewrite:
cookie_scope: root
# flickr rules
#=================================================================
- url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
fuzzy_lookup: '([^/]+(?:\.css|\.js))'