mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cookie rewriter work: add the ability to set a custom 'root scope' rewriter,
which sets the path of all cookies to the pywb root. It can be enabled per url-prefix in the rules; more testing and other options are still needed.
This commit is contained in:
parent
7feb0893eb
commit
f1b3f8c76f
@ -71,7 +71,8 @@ class WbRequest(object):
|
|||||||
self.wb_url = wburl_class(wb_url_str)
|
self.wb_url = wburl_class(wb_url_str)
|
||||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
|
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
|
||||||
host_prefix + rel_prefix,
|
host_prefix + rel_prefix,
|
||||||
rel_prefix)
|
rel_prefix,
|
||||||
|
env.get('SCRIPT_NAME', '/'))
|
||||||
else:
|
else:
|
||||||
# no wb_url, just store blank wb_url
|
# no wb_url, just store blank wb_url
|
||||||
self.wb_url = None
|
self.wb_url = None
|
||||||
@ -96,9 +97,6 @@ class WbRequest(object):
|
|||||||
if value and value.lower() == 'xmlhttprequest':
|
if value and value.lower() == 'xmlhttprequest':
|
||||||
return True
|
return True
|
||||||
|
|
||||||
#if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
|
|
||||||
# return True
|
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -2,10 +2,8 @@ from Cookie import SimpleCookie, CookieError
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class WbUrlCookieRewriter(object):
|
class WbUrlBaseCookieRewriter(object):
|
||||||
""" Cookie rewriter for wburl-based requests
|
""" Base Cookie rewriter for wburl-based requests.
|
||||||
Remove the domain and rewrite path, if any, to match
|
|
||||||
given WbUrl using the url rewriter.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, url_rewriter):
|
def __init__(self, url_rewriter):
|
||||||
self.url_rewriter = url_rewriter
|
self.url_rewriter = url_rewriter
|
||||||
@ -19,21 +17,68 @@ class WbUrlCookieRewriter(object):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
for name, morsel in cookie.iteritems():
|
for name, morsel in cookie.iteritems():
|
||||||
# if domain set, no choice but to expand cookie path to root
|
morsel = self.rewrite_cookie(name, morsel)
|
||||||
if morsel.get('domain'):
|
if morsel:
|
||||||
del morsel['domain']
|
results.append((header, morsel.OutputString()))
|
||||||
morsel['path'] = self.url_rewriter.rel_prefix
|
|
||||||
# else set cookie to rewritten path
|
|
||||||
elif morsel.get('path'):
|
|
||||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
|
||||||
# remove expires as it refers to archived time
|
|
||||||
if morsel.get('expires'):
|
|
||||||
del morsel['expires']
|
|
||||||
|
|
||||||
# don't use max-age, just expire at end of session
|
|
||||||
if morsel.get('max-age'):
|
|
||||||
del morsel['max-age']
|
|
||||||
|
|
||||||
results.append((header, morsel.OutputString()))
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Cookie rewriter that keeps every cookie in the narrowest scope it can.

    A cookie that carries a domain has the domain removed and its path
    set to the url rewriter's rel_prefix. Otherwise, a cookie that
    carries a path has that path passed through the url rewriter.
    The expires and max-age attributes are dropped whenever they are set.
    """

    def rewrite_cookie(self, name, morsel):
        """ Rewrite a single cookie morsel in place and return it.
        """
        if morsel.get('domain'):
            # a domain-wide cookie can't be pinned to a single url,
            # so drop the domain and widen the path to the prefix
            del morsel['domain']
            morsel['path'] = self.url_rewriter.rel_prefix
        else:
            original_path = morsel.get('path')
            if original_path:
                # scope the cookie to the rewritten form of its own path
                morsel['path'] = self.url_rewriter.rewrite(original_path)

        # expires refers to archived time, and max-age is not used so
        # that the cookie simply expires at the end of the session
        for lifetime_attr in ('expires', 'max-age'):
            if morsel.get(lifetime_attr):
                del morsel[lifetime_attr]

        return morsel
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Sometimes it is necessary to rewrite cookies to root scope
    in order to work across time boundaries and modifiers.

    This rewriter sets the path of every cookie to the url rewriter's
    root_path, and removes the domain, expires and max-age attributes
    when they are set.
    """

    def rewrite_cookie(self, name, morsel):
        """ Rewrite a single cookie morsel in place and return it.
        """
        # every cookie is scoped to the root path
        morsel['path'] = self.url_rewriter.root_path

        # domain: the cookie must apply to the replay host instead
        # expires: refers to archived time
        # max-age: dropped as well, matching MinimalScopeCookieRewriter,
        #          so the cookie just expires at the end of the session
        #          (a leftover max-age would override the removed expires)
        for attr in ('domain', 'expires', 'max-age'):
            if morsel.get(attr):
                del morsel[attr]

        return morsel
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def get_cookie_rewriter(rule):
    """ Pick the cookie rewriter class for the given rewrite rule.

    Returns RootScopeCookieRewriter when a rule is supplied and its
    cookie_scope equals 'root'; in every other case (including no rule
    at all) returns MinimalScopeCookieRewriter. The class itself is
    returned, not an instance.
    """
    use_root_scope = rule and rule.cookie_scope == 'root'
    return (RootScopeCookieRewriter if use_root_scope
            else MinimalScopeCookieRewriter)
|
||||||
|
@ -49,7 +49,7 @@ class HeaderRewriter:
|
|||||||
def __init__(self, header_prefix='X-Archive-Orig-'):
|
def __init__(self, header_prefix='X-Archive-Orig-'):
|
||||||
self.header_prefix = header_prefix
|
self.header_prefix = header_prefix
|
||||||
|
|
||||||
def rewrite(self, status_headers, urlrewriter):
|
def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
|
||||||
content_type = status_headers.get_header('Content-Type')
|
content_type = status_headers.get_header('Content-Type')
|
||||||
text_type = None
|
text_type = None
|
||||||
charset = None
|
charset = None
|
||||||
@ -63,6 +63,7 @@ class HeaderRewriter:
|
|||||||
|
|
||||||
result = self._rewrite_headers(status_headers.headers,
|
result = self._rewrite_headers(status_headers.headers,
|
||||||
urlrewriter,
|
urlrewriter,
|
||||||
|
cookie_rewriter,
|
||||||
strip_encoding)
|
strip_encoding)
|
||||||
|
|
||||||
new_headers = result[0]
|
new_headers = result[0]
|
||||||
@ -89,15 +90,12 @@ class HeaderRewriter:
|
|||||||
|
|
||||||
return content_type[idx + len(CHARSET_TOKEN):].lower()
|
return content_type[idx + len(CHARSET_TOKEN):].lower()
|
||||||
|
|
||||||
def _rewrite_headers(self, headers, urlrewriter, content_rewritten=False):
|
def _rewrite_headers(self, headers, urlrewriter, cookie_rewriter,
|
||||||
|
content_rewritten):
|
||||||
|
|
||||||
new_headers = []
|
new_headers = []
|
||||||
removed_header_dict = {}
|
removed_header_dict = {}
|
||||||
|
|
||||||
if urlrewriter:
|
|
||||||
cookie_rewriter = urlrewriter.get_cookie_rewriter()
|
|
||||||
else:
|
|
||||||
cookie_rewriter = None
|
|
||||||
|
|
||||||
for (name, value) in headers:
|
for (name, value) in headers:
|
||||||
|
|
||||||
lowername = name.lower()
|
lowername = name.lower()
|
||||||
|
@ -37,13 +37,17 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (status_headers, stream)
|
return (status_headers, stream)
|
||||||
|
|
||||||
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
|
def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''):
|
||||||
|
|
||||||
header_rewriter_class = (self.ruleset.get_first_match(urlkey).
|
header_rewriter_class = rule.rewriters['header']
|
||||||
rewriters['header'])
|
|
||||||
|
cookie_rewriter = None
|
||||||
|
|
||||||
|
if urlrewriter:
|
||||||
|
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
|
||||||
|
|
||||||
rewritten_headers = (header_rewriter_class().
|
rewritten_headers = (header_rewriter_class().
|
||||||
rewrite(status_headers, urlrewriter))
|
rewrite(status_headers, urlrewriter, cookie_rewriter))
|
||||||
|
|
||||||
# note: since chunk encoding may/may not be valid,
|
# note: since chunk encoding may/may not be valid,
|
||||||
# the approach taken here is to *always* attempt
|
# the approach taken here is to *always* attempt
|
||||||
@ -74,9 +78,12 @@ class RewriteContent:
|
|||||||
if wb_url.is_banner_only:
|
if wb_url.is_banner_only:
|
||||||
urlrewriter = None
|
urlrewriter = None
|
||||||
|
|
||||||
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
headers,
|
|
||||||
stream)
|
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
|
||||||
|
rule,
|
||||||
|
headers,
|
||||||
|
stream)
|
||||||
|
|
||||||
status_headers = rewritten_headers.status_headers
|
status_headers = rewritten_headers.status_headers
|
||||||
|
|
||||||
@ -112,8 +119,6 @@ class RewriteContent:
|
|||||||
else:
|
else:
|
||||||
stream = DecompressingBufferedReader(stream)
|
stream = DecompressingBufferedReader(stream)
|
||||||
|
|
||||||
rule = self.ruleset.get_first_match(urlkey)
|
|
||||||
|
|
||||||
rewriter_class = rule.rewriters[text_type]
|
rewriter_class = rule.rewriters[text_type]
|
||||||
|
|
||||||
# for html, need to perform header insert, supply js, css, xml
|
# for html, need to perform header insert, supply js, css, xml
|
||||||
|
@ -42,6 +42,9 @@ class RewriteRules(BaseRule):
|
|||||||
# add any regexs for js rewriter
|
# add any regexs for js rewriter
|
||||||
self._add_custom_regexs('js', config)
|
self._add_custom_regexs('js', config)
|
||||||
|
|
||||||
|
# cookie rewrite scope
|
||||||
|
self.cookie_scope = config.get('cookie_scope', 'default')
|
||||||
|
|
||||||
def _add_custom_regexs(self, field, config):
|
def _add_custom_regexs(self, field, config):
|
||||||
regexs = config.get(field + '_regexs')
|
regexs = config.get(field + '_regexs')
|
||||||
if not regexs:
|
if not regexs:
|
||||||
|
@ -26,7 +26,7 @@ r"""
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter
|
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
@ -35,5 +35,5 @@ urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
|||||||
|
|
||||||
|
|
||||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
|
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
|
||||||
return WbUrlCookieRewriter(rewriter).rewrite(cookie_str)
|
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
|||||||
headerrewriter = HeaderRewriter()
|
headerrewriter = HeaderRewriter()
|
||||||
|
|
||||||
def _test_headers(headers, status = '200 OK'):
|
def _test_headers(headers, status = '200 OK'):
|
||||||
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
|
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter())
|
||||||
return pprint.pprint(vars(rewritten))
|
return pprint.pprint(vars(rewritten))
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ import copy
|
|||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from wburl import WbUrl
|
from wburl import WbUrl
|
||||||
from cookie_rewriter import WbUrlCookieRewriter
|
from cookie_rewriter import get_cookie_rewriter
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -18,11 +18,12 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||||
|
|
||||||
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None):
|
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None):
|
||||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
self.full_prefix = full_prefix
|
self.full_prefix = full_prefix
|
||||||
self.rel_prefix = rel_prefix if rel_prefix else prefix
|
self.rel_prefix = rel_prefix if rel_prefix else prefix
|
||||||
|
self.root_path = root_path if root_path else '/'
|
||||||
|
|
||||||
def rewrite(self, url, mod=None):
|
def rewrite(self, url, mod=None):
|
||||||
# if special protocol, no rewriting at all
|
# if special protocol, no rewriting at all
|
||||||
@ -83,8 +84,9 @@ class UrlRewriter(object):
|
|||||||
new_wburl = WbUrl(new_url)
|
new_wburl = WbUrl(new_url)
|
||||||
return UrlRewriter(new_wburl, self.prefix)
|
return UrlRewriter(new_wburl, self.prefix)
|
||||||
|
|
||||||
def get_cookie_rewriter(self):
|
def get_cookie_rewriter(self, rule=None):
|
||||||
return WbUrlCookieRewriter(self)
|
cls = get_cookie_rewriter(rule)
|
||||||
|
return cls(self)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||||
@ -149,5 +151,5 @@ class HttpsUrlRewriter(UrlRewriter):
|
|||||||
def rebase_rewriter(self, new_url):
|
def rebase_rewriter(self, new_url):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get_cookie_rewriter(self):
|
def get_cookie_rewriter(self, rule=None):
|
||||||
return None
|
return None
|
||||||
|
@ -46,7 +46,14 @@ rules:
|
|||||||
parse_comments: true
|
parse_comments: true
|
||||||
|
|
||||||
|
|
||||||
# flickr rules
|
# instagram rules
|
||||||
|
#=================================================================
|
||||||
|
- url_prefix: 'com,instagram'
|
||||||
|
rewrite:
|
||||||
|
cookie_scope: root
|
||||||
|
|
||||||
|
|
||||||
|
# flickr rules
|
||||||
#=================================================================
|
#=================================================================
|
||||||
- url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
|
- url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
|
||||||
fuzzy_lookup: '([^/]+(?:\.css|\.js))'
|
fuzzy_lookup: '([^/]+(?:\.css|\.js))'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user