mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cookie rewriter work: ability to set a custom 'root scope' rewriter,
which sets the path of all cookies to the pywb root. Option to enable per url-prefix in rules; still needs more testing, and other options are needed.
This commit is contained in:
parent
7feb0893eb
commit
f1b3f8c76f
@ -71,7 +71,8 @@ class WbRequest(object):
|
||||
self.wb_url = wburl_class(wb_url_str)
|
||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
|
||||
host_prefix + rel_prefix,
|
||||
rel_prefix)
|
||||
rel_prefix,
|
||||
env.get('SCRIPT_NAME', '/'))
|
||||
else:
|
||||
# no wb_url, just store blank wb_url
|
||||
self.wb_url = None
|
||||
@ -96,9 +97,6 @@ class WbRequest(object):
|
||||
if value and value.lower() == 'xmlhttprequest':
|
||||
return True
|
||||
|
||||
#if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
|
||||
# return True
|
||||
|
||||
return False
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -2,10 +2,8 @@ from Cookie import SimpleCookie, CookieError
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrlCookieRewriter(object):
|
||||
""" Cookie rewriter for wburl-based requests
|
||||
Remove the domain and rewrite path, if any, to match
|
||||
given WbUrl using the url rewriter.
|
||||
class WbUrlBaseCookieRewriter(object):
|
||||
""" Base Cookie rewriter for wburl-based requests.
|
||||
"""
|
||||
def __init__(self, url_rewriter):
|
||||
self.url_rewriter = url_rewriter
|
||||
@ -19,21 +17,68 @@ class WbUrlCookieRewriter(object):
|
||||
return results
|
||||
|
||||
for name, morsel in cookie.iteritems():
|
||||
# if domain set, no choice but to expand cookie path to root
|
||||
if morsel.get('domain'):
|
||||
del morsel['domain']
|
||||
morsel['path'] = self.url_rewriter.rel_prefix
|
||||
# else set cookie to rewritten path
|
||||
elif morsel.get('path'):
|
||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||
# remove expires as it refers to archived time
|
||||
if morsel.get('expires'):
|
||||
del morsel['expires']
|
||||
|
||||
# don't use max-age, just expire at end of session
|
||||
if morsel.get('max-age'):
|
||||
del morsel['max-age']
|
||||
|
||||
results.append((header, morsel.OutputString()))
|
||||
morsel = self.rewrite_cookie(name, morsel)
|
||||
if morsel:
|
||||
results.append((header, morsel.OutputString()))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Cookie rewriter that keeps each cookie's scope as narrow as possible.

    A cookie that carries a domain has the domain dropped and its path
    set to the url rewriter's ``rel_prefix``. Otherwise, a cookie that
    carries a path has that path passed through the url rewriter.
    The ``expires`` and ``max-age`` attributes are removed when set.
    """

    def rewrite_cookie(self, name, morsel):
        """
        Rewrite a single cookie morsel in place and return it.

        ``name`` is the cookie name; this rewriter does not use it.
        """
        if morsel.get('domain'):
            # a domain-scoped cookie can't be narrowed to one url,
            # so drop the domain and widen the path to the prefix
            del morsel['domain']
            morsel['path'] = self.url_rewriter.rel_prefix
        else:
            orig_path = morsel.get('path')
            if orig_path:
                # no domain: scope the cookie to the rewritten path
                morsel['path'] = self.url_rewriter.rewrite(orig_path)

        # expires refers to archived time, and max-age is not used,
        # so the cookie simply expires at the end of the session
        for attr in ('expires', 'max-age'):
            if morsel.get(attr):
                del morsel[attr]

        return morsel
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Cookie rewriter that places every cookie in the root scope.

    Sometimes it is necessary to rewrite cookies to root scope
    in order to work across time boundaries and modifiers.

    Each cookie's path is set to the url rewriter's ``root_path``.
    The ``domain``, ``expires`` and ``max-age`` attributes are
    removed when set.
    """

    def rewrite_cookie(self, name, morsel):
        """
        Rewrite a single cookie morsel in place and return it.

        ``name`` is the cookie name; this rewriter does not use it.
        """
        # every cookie is scoped to the root path
        morsel['path'] = self.url_rewriter.root_path

        # domain: removed so the cookie is not tied to the original host
        # expires: removed as it refers to archived time
        # max-age: removed, as in MinimalScopeCookieRewriter, so the
        #          cookie just expires at the end of the session
        for attr in ('domain', 'expires', 'max-age'):
            if morsel.get(attr):
                del morsel[attr]

        return morsel
|
||||
|
||||
|
||||
#=================================================================
|
||||
def get_cookie_rewriter(rule):
    """
    Return the cookie rewriter class selected by ``rule``.

    ``RootScopeCookieRewriter`` is returned when ``rule`` is truthy and
    its ``cookie_scope`` equals ``'root'``; in every other case
    ``MinimalScopeCookieRewriter`` is returned.
    """
    return (RootScopeCookieRewriter
            if (rule and rule.cookie_scope == 'root')
            else MinimalScopeCookieRewriter)
|
||||
|
@ -49,7 +49,7 @@ class HeaderRewriter:
|
||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
||||
self.header_prefix = header_prefix
|
||||
|
||||
def rewrite(self, status_headers, urlrewriter):
|
||||
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
|
||||
content_type = status_headers.get_header('Content-Type')
|
||||
text_type = None
|
||||
charset = None
|
||||
@ -63,6 +63,7 @@ class HeaderRewriter:
|
||||
|
||||
result = self._rewrite_headers(status_headers.headers,
|
||||
urlrewriter,
|
||||
cookie_rewriter,
|
||||
strip_encoding)
|
||||
|
||||
new_headers = result[0]
|
||||
@ -89,15 +90,12 @@ class HeaderRewriter:
|
||||
|
||||
return content_type[idx + len(CHARSET_TOKEN):].lower()
|
||||
|
||||
def _rewrite_headers(self, headers, urlrewriter, content_rewritten=False):
|
||||
def _rewrite_headers(self, headers, urlrewriter, cookie_rewriter,
|
||||
content_rewritten):
|
||||
|
||||
new_headers = []
|
||||
removed_header_dict = {}
|
||||
|
||||
if urlrewriter:
|
||||
cookie_rewriter = urlrewriter.get_cookie_rewriter()
|
||||
else:
|
||||
cookie_rewriter = None
|
||||
|
||||
for (name, value) in headers:
|
||||
|
||||
lowername = name.lower()
|
||||
|
@ -37,13 +37,17 @@ class RewriteContent:
|
||||
|
||||
return (status_headers, stream)
|
||||
|
||||
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
|
||||
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''):
|
||||
|
||||
header_rewriter_class = (self.ruleset.get_first_match(urlkey).
|
||||
rewriters['header'])
|
||||
header_rewriter_class = rule.rewriters['header']
|
||||
|
||||
cookie_rewriter = None
|
||||
|
||||
if urlrewriter:
|
||||
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
|
||||
|
||||
rewritten_headers = (header_rewriter_class().
|
||||
rewrite(status_headers, urlrewriter))
|
||||
rewrite(status_headers, urlrewriter, cookie_rewriter))
|
||||
|
||||
# note: since chunk encoding may/may not be valid,
|
||||
# the approach taken here is to *always* attempt
|
||||
@ -74,9 +78,12 @@ class RewriteContent:
|
||||
if wb_url.is_banner_only:
|
||||
urlrewriter = None
|
||||
|
||||
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
|
||||
headers,
|
||||
stream)
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
|
||||
rule,
|
||||
headers,
|
||||
stream)
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
@ -112,8 +119,6 @@ class RewriteContent:
|
||||
else:
|
||||
stream = DecompressingBufferedReader(stream)
|
||||
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
rewriter_class = rule.rewriters[text_type]
|
||||
|
||||
# for html, need to perform header insert, supply js, css, xml
|
||||
|
@ -42,6 +42,9 @@ class RewriteRules(BaseRule):
|
||||
# add any regexs for js rewriter
|
||||
self._add_custom_regexs('js', config)
|
||||
|
||||
# cookie rewrite scope
|
||||
self.cookie_scope = config.get('cookie_scope', 'default')
|
||||
|
||||
def _add_custom_regexs(self, field, config):
|
||||
regexs = config.get(field + '_regexs')
|
||||
if not regexs:
|
||||
|
@ -26,7 +26,7 @@ r"""
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
|
||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||
@ -35,5 +35,5 @@ urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
||||
|
||||
|
||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
|
||||
return WbUrlCookieRewriter(rewriter).rewrite(cookie_str)
|
||||
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
|
||||
|
||||
|
@ -71,7 +71,7 @@ urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||
headerrewriter = HeaderRewriter()
|
||||
|
||||
def _test_headers(headers, status = '200 OK'):
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
|
||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter())
|
||||
return pprint.pprint(vars(rewritten))
|
||||
|
||||
|
||||
|
@ -2,7 +2,7 @@ import copy
|
||||
import urlparse
|
||||
|
||||
from wburl import WbUrl
|
||||
from cookie_rewriter import WbUrlCookieRewriter
|
||||
from cookie_rewriter import get_cookie_rewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -18,11 +18,12 @@ class UrlRewriter(object):
|
||||
|
||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None):
|
||||
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
self.prefix = prefix
|
||||
self.full_prefix = full_prefix
|
||||
self.rel_prefix = rel_prefix if rel_prefix else prefix
|
||||
self.root_path = root_path if root_path else '/'
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
# if special protocol, no rewriting at all
|
||||
@ -83,8 +84,9 @@ class UrlRewriter(object):
|
||||
new_wburl = WbUrl(new_url)
|
||||
return UrlRewriter(new_wburl, self.prefix)
|
||||
|
||||
def get_cookie_rewriter(self):
|
||||
return WbUrlCookieRewriter(self)
|
||||
def get_cookie_rewriter(self, rule=None):
|
||||
cls = get_cookie_rewriter(rule)
|
||||
return cls(self)
|
||||
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
@ -149,5 +151,5 @@ class HttpsUrlRewriter(UrlRewriter):
|
||||
def rebase_rewriter(self, new_url):
|
||||
return self
|
||||
|
||||
def get_cookie_rewriter(self):
|
||||
def get_cookie_rewriter(self, rule=None):
|
||||
return None
|
||||
|
@ -46,7 +46,14 @@ rules:
|
||||
parse_comments: true
|
||||
|
||||
|
||||
# flickr rules
|
||||
# instagram rules
|
||||
#=================================================================
|
||||
- url_prefix: 'com,instagram'
|
||||
rewrite:
|
||||
cookie_scope: root
|
||||
|
||||
|
||||
# flickr rules
|
||||
#=================================================================
|
||||
- url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
|
||||
fuzzy_lookup: '([^/]+(?:\.css|\.js))'
|
||||
|
Loading…
x
Reference in New Issue
Block a user