1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cookie rewriter work: add the ability to set a custom 'root scope' rewriter,

which sets the path of all cookies to the pywb root.
The root scope can be enabled per url-prefix in the rules; more testing and
other options are still needed.
This commit is contained in:
Ilya Kreymer 2014-09-30 12:42:11 -07:00
parent 7feb0893eb
commit f1b3f8c76f
9 changed files with 107 additions and 49 deletions

View File

@ -71,7 +71,8 @@ class WbRequest(object):
self.wb_url = wburl_class(wb_url_str) self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix, self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
host_prefix + rel_prefix, host_prefix + rel_prefix,
rel_prefix) rel_prefix,
env.get('SCRIPT_NAME', '/'))
else: else:
# no wb_url, just store blank wb_url # no wb_url, just store blank wb_url
self.wb_url = None self.wb_url = None
@ -96,9 +97,6 @@ class WbRequest(object):
if value and value.lower() == 'xmlhttprequest': if value and value.lower() == 'xmlhttprequest':
return True return True
#if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
# return True
return False return False
def __repr__(self): def __repr__(self):

View File

@ -2,10 +2,8 @@ from Cookie import SimpleCookie, CookieError
#================================================================= #=================================================================
class WbUrlCookieRewriter(object): class WbUrlBaseCookieRewriter(object):
""" Cookie rewriter for wburl-based requests """ Base Cookie rewriter for wburl-based requests.
Remove the domain and rewrite path, if any, to match
given WbUrl using the url rewriter.
""" """
def __init__(self, url_rewriter): def __init__(self, url_rewriter):
self.url_rewriter = url_rewriter self.url_rewriter = url_rewriter
@ -19,21 +17,68 @@ class WbUrlCookieRewriter(object):
return results return results
for name, morsel in cookie.iteritems(): for name, morsel in cookie.iteritems():
# if domain set, no choice but to expand cookie path to root morsel = self.rewrite_cookie(name, morsel)
if morsel.get('domain'): if morsel:
del morsel['domain'] results.append((header, morsel.OutputString()))
morsel['path'] = self.url_rewriter.rel_prefix
# else set cookie to rewritten path
elif morsel.get('path'):
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
# remove expires as it refers to archived time
if morsel.get('expires'):
del morsel['expires']
# don't use max-age, just expire at end of session
if morsel.get('max-age'):
del morsel['max-age']
results.append((header, morsel.OutputString()))
return results return results
#=================================================================
class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Rewrite cookies to the narrowest scope possible.

    A cookie that carries a domain loses the domain and is scoped to the
    rewriter's relative prefix; otherwise a cookie that carries a path has
    that path passed through the url rewriter. The 'expires' and 'max-age'
    attributes are dropped in either case.
    """
    def rewrite_cookie(self, name, morsel):
        """
        Rewrite a single cookie morsel in place and return it.

        :param name: cookie name (not used by this rewriter)
        :param morsel: the cookie morsel to modify
        :return: the same morsel, after modification
        """
        if morsel.get('domain'):
            # a domain cookie can not be tied to one url, so the only
            # choice is to widen its path to the prefix root
            del morsel['domain']
            morsel['path'] = self.url_rewriter.rel_prefix
        elif morsel.get('path'):
            # no domain: keep the cookie on its own (rewritten) path
            rewritten_path = self.url_rewriter.rewrite(morsel['path'])
            morsel['path'] = rewritten_path

        # 'expires' refers to archived time, and 'max-age' is not used
        # so that the cookie simply expires at end of session
        for lifetime_attr in ('expires', 'max-age'):
            if morsel.get(lifetime_attr):
                del morsel[lifetime_attr]

        return morsel
#=================================================================
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Sometimes it is necessary to rewrite cookies to root scope
    in order to work across time boundaries and modifiers.
    This rewriter simply sets all cookies to be in the root path
    and strips the attributes that should not carry over to replay.
    """
    def rewrite_cookie(self, name, morsel):
        """
        Rewrite a single cookie morsel in place to the root scope.

        :param name: cookie name (not used by this rewriter)
        :param morsel: the cookie morsel to modify
        :return: the same morsel, after modification
        """
        # every cookie is scoped to the root path of the url rewriter
        morsel['path'] = self.url_rewriter.root_path

        # domain:  removed, the cookie is scoped by path only
        # expires: removed, as it refers to archived time
        # max-age: removed, so the cookie just expires at end of session
        #          (same handling as MinimalScopeCookieRewriter); left in
        #          place it would make the replayed cookie persistent
        for attr in ('domain', 'expires', 'max-age'):
            if morsel.get(attr):
                del morsel[attr]

        return morsel
#=================================================================
def get_cookie_rewriter(rule):
    """
    Pick the cookie rewriter class for the given rewrite rule.

    :param rule: matched rule object, or a false value if there is none;
                 a true rule is read for its ``cookie_scope`` attribute
    :return: RootScopeCookieRewriter when the rule's cookie_scope is
             'root', MinimalScopeCookieRewriter in every other case
             (the class itself is returned, not an instance)
    """
    use_root_scope = bool(rule) and rule.cookie_scope == 'root'
    return (RootScopeCookieRewriter if use_root_scope
            else MinimalScopeCookieRewriter)

View File

@ -49,7 +49,7 @@ class HeaderRewriter:
def __init__(self, header_prefix='X-Archive-Orig-'): def __init__(self, header_prefix='X-Archive-Orig-'):
self.header_prefix = header_prefix self.header_prefix = header_prefix
def rewrite(self, status_headers, urlrewriter): def rewrite(self, status_headers, urlrewriter, cookie_rewriter):
content_type = status_headers.get_header('Content-Type') content_type = status_headers.get_header('Content-Type')
text_type = None text_type = None
charset = None charset = None
@ -63,6 +63,7 @@ class HeaderRewriter:
result = self._rewrite_headers(status_headers.headers, result = self._rewrite_headers(status_headers.headers,
urlrewriter, urlrewriter,
cookie_rewriter,
strip_encoding) strip_encoding)
new_headers = result[0] new_headers = result[0]
@ -89,15 +90,12 @@ class HeaderRewriter:
return content_type[idx + len(CHARSET_TOKEN):].lower() return content_type[idx + len(CHARSET_TOKEN):].lower()
def _rewrite_headers(self, headers, urlrewriter, content_rewritten=False): def _rewrite_headers(self, headers, urlrewriter, cookie_rewriter,
content_rewritten):
new_headers = [] new_headers = []
removed_header_dict = {} removed_header_dict = {}
if urlrewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter()
else:
cookie_rewriter = None
for (name, value) in headers: for (name, value) in headers:
lowername = name.lower() lowername = name.lower()

View File

@ -37,13 +37,17 @@ class RewriteContent:
return (status_headers, stream) return (status_headers, stream)
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''):
header_rewriter_class = (self.ruleset.get_first_match(urlkey). header_rewriter_class = rule.rewriters['header']
rewriters['header'])
cookie_rewriter = None
if urlrewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
rewritten_headers = (header_rewriter_class(). rewritten_headers = (header_rewriter_class().
rewrite(status_headers, urlrewriter)) rewrite(status_headers, urlrewriter, cookie_rewriter))
# note: since chunk encoding may/may not be valid, # note: since chunk encoding may/may not be valid,
# the approach taken here is to *always* attempt # the approach taken here is to *always* attempt
@ -74,9 +78,12 @@ class RewriteContent:
if wb_url.is_banner_only: if wb_url.is_banner_only:
urlrewriter = None urlrewriter = None
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter, rule = self.ruleset.get_first_match(urlkey)
headers,
stream) (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
rule,
headers,
stream)
status_headers = rewritten_headers.status_headers status_headers = rewritten_headers.status_headers
@ -112,8 +119,6 @@ class RewriteContent:
else: else:
stream = DecompressingBufferedReader(stream) stream = DecompressingBufferedReader(stream)
rule = self.ruleset.get_first_match(urlkey)
rewriter_class = rule.rewriters[text_type] rewriter_class = rule.rewriters[text_type]
# for html, need to perform header insert, supply js, css, xml # for html, need to perform header insert, supply js, css, xml

View File

@ -42,6 +42,9 @@ class RewriteRules(BaseRule):
# add any regexs for js rewriter # add any regexs for js rewriter
self._add_custom_regexs('js', config) self._add_custom_regexs('js', config)
# cookie rewrite scope
self.cookie_scope = config.get('cookie_scope', 'default')
def _add_custom_regexs(self, field, config): def _add_custom_regexs(self, field, config):
regexs = config.get(field + '_regexs') regexs = config.get(field + '_regexs')
if not regexs: if not regexs:

View File

@ -26,7 +26,7 @@ r"""
""" """
from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
@ -35,5 +35,5 @@ urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
def rewrite_cookie(cookie_str, rewriter=urlrewriter): def rewrite_cookie(cookie_str, rewriter=urlrewriter):
return WbUrlCookieRewriter(rewriter).rewrite(cookie_str) return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)

View File

@ -71,7 +71,7 @@ urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
headerrewriter = HeaderRewriter() headerrewriter = HeaderRewriter()
def _test_headers(headers, status = '200 OK'): def _test_headers(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter())
return pprint.pprint(vars(rewritten)) return pprint.pprint(vars(rewritten))

View File

@ -2,7 +2,7 @@ import copy
import urlparse import urlparse
from wburl import WbUrl from wburl import WbUrl
from cookie_rewriter import WbUrlCookieRewriter from cookie_rewriter import get_cookie_rewriter
#================================================================= #=================================================================
@ -18,11 +18,12 @@ class UrlRewriter(object):
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None): def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix self.prefix = prefix
self.full_prefix = full_prefix self.full_prefix = full_prefix
self.rel_prefix = rel_prefix if rel_prefix else prefix self.rel_prefix = rel_prefix if rel_prefix else prefix
self.root_path = root_path if root_path else '/'
def rewrite(self, url, mod=None): def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all # if special protocol, no rewriting at all
@ -83,8 +84,9 @@ class UrlRewriter(object):
new_wburl = WbUrl(new_url) new_wburl = WbUrl(new_url)
return UrlRewriter(new_wburl, self.prefix) return UrlRewriter(new_wburl, self.prefix)
def get_cookie_rewriter(self): def get_cookie_rewriter(self, rule=None):
return WbUrlCookieRewriter(self) cls = get_cookie_rewriter(rule)
return cls(self)
def __repr__(self): def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@ -149,5 +151,5 @@ class HttpsUrlRewriter(UrlRewriter):
def rebase_rewriter(self, new_url): def rebase_rewriter(self, new_url):
return self return self
def get_cookie_rewriter(self): def get_cookie_rewriter(self, rule=None):
return None return None

View File

@ -46,7 +46,14 @@ rules:
parse_comments: true parse_comments: true
# flickr rules # instagram rules
#=================================================================
- url_prefix: 'com,instagram'
rewrite:
cookie_scope: root
# flickr rules
#================================================================= #=================================================================
- url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo'] - url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
fuzzy_lookup: '([^/]+(?:\.css|\.js))' fuzzy_lookup: '([^/]+(?:\.css|\.js))'