From f1b3f8c76f19a016314eea169fa13041ecba2319 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 30 Sep 2014 12:42:11 -0700 Subject: [PATCH] cookie rewriter work: ability to set a custom 'root scope' rewriter, which sets the path of all cookies to pywb root. Option to enable per url-prefix in rules, still more testing, other options needed --- pywb/framework/wbrequestresponse.py | 6 +- pywb/rewrite/cookie_rewriter.py | 85 +++++++++++++++++------ pywb/rewrite/header_rewriter.py | 12 ++-- pywb/rewrite/rewrite_content.py | 23 +++--- pywb/rewrite/rewriterules.py | 3 + pywb/rewrite/test/test_cookie_rewriter.py | 4 +- pywb/rewrite/test/test_header_rewriter.py | 2 +- pywb/rewrite/url_rewriter.py | 12 ++-- pywb/rules.yaml | 9 ++- 9 files changed, 107 insertions(+), 49 deletions(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 17dea3d8..49b4737b 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -71,7 +71,8 @@ class WbRequest(object): self.wb_url = wburl_class(wb_url_str) self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix, host_prefix + rel_prefix, - rel_prefix) + rel_prefix, + env.get('SCRIPT_NAME', '/')) else: # no wb_url, just store blank wb_url self.wb_url = None @@ -96,9 +97,6 @@ class WbRequest(object): if value and value.lower() == 'xmlhttprequest': return True - #if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')): - # return True - return False def __repr__(self): diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index f4736b91..d540b749 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -2,10 +2,8 @@ from Cookie import SimpleCookie, CookieError #================================================================= -class WbUrlCookieRewriter(object): - """ Cookie rewriter for wburl-based requests - Remove the domain and rewrite path, if any, to match - given WbUrl using the url rewriter. +class WbUrlBaseCookieRewriter(object): + """ Base Cookie rewriter for wburl-based requests. """ def __init__(self, url_rewriter): self.url_rewriter = url_rewriter @@ -19,21 +17,68 @@ class WbUrlCookieRewriter(object): return results for name, morsel in cookie.iteritems(): - # if domain set, no choice but to expand cookie path to root - if morsel.get('domain'): - del morsel['domain'] - morsel['path'] = self.url_rewriter.rel_prefix - # else set cookie to rewritten path - elif morsel.get('path'): - morsel['path'] = self.url_rewriter.rewrite(morsel['path']) - # remove expires as it refers to archived time - if morsel.get('expires'): - del morsel['expires'] - - # don't use max-age, just expire at end of session - if morsel.get('max-age'): - del morsel['max-age'] - - results.append((header, morsel.OutputString())) + morsel = self.rewrite_cookie(name, morsel) + if morsel: + results.append((header, morsel.OutputString())) return results + + +#================================================================= +class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): + """ + Attempt to rewrite cookies to minimal scope possible + + If path present, rewrite path to current rewritten url only + If domain present, remove domain and set to path prefix + """ + + def rewrite_cookie(self, name, morsel): + # if domain set, no choice but to expand cookie path to root + if morsel.get('domain'): + del morsel['domain'] + morsel['path'] = self.url_rewriter.rel_prefix + # else set cookie to rewritten path + elif morsel.get('path'): + morsel['path'] = self.url_rewriter.rewrite(morsel['path']) + + # remove expires as it refers to archived time + if morsel.get('expires'): + del morsel['expires'] + + # don't use max-age, just expire at end of session + if morsel.get('max-age'): + del morsel['max-age'] + + return morsel + + +#================================================================= +class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): + """ + Sometimes it is necessary to rewrite cookies to root scope + in order to work across time boundaries and modifiers + + This rewriter simply sets all cookies to be in the root + """ + def rewrite_cookie(self, name, morsel): + # get root path + morsel['path'] = self.url_rewriter.root_path + + # remove domain + if morsel.get('domain'): + del morsel['domain'] + + # remove expires as it refers to archived time + if morsel.get('expires'): + del morsel['expires'] + + return morsel + + +#================================================================= +def get_cookie_rewriter(rule): + if rule and rule.cookie_scope == 'root': + return RootScopeCookieRewriter + else: + return MinimalScopeCookieRewriter diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 950817d4..fd35d254 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -49,7 +49,7 @@ class HeaderRewriter: def __init__(self, header_prefix='X-Archive-Orig-'): self.header_prefix = header_prefix - def rewrite(self, status_headers, urlrewriter): + def rewrite(self, status_headers, urlrewriter, cookie_rewriter): content_type = status_headers.get_header('Content-Type') text_type = None charset = None @@ -63,6 +63,7 @@ class HeaderRewriter: result = self._rewrite_headers(status_headers.headers, urlrewriter, + cookie_rewriter, strip_encoding) new_headers = result[0] @@ -89,15 +90,12 @@ class HeaderRewriter: return content_type[idx + len(CHARSET_TOKEN):].lower() - def _rewrite_headers(self, headers, urlrewriter, content_rewritten=False): + def _rewrite_headers(self, headers, urlrewriter, cookie_rewriter, + content_rewritten): + new_headers = [] removed_header_dict = {} - if urlrewriter: - cookie_rewriter = urlrewriter.get_cookie_rewriter() - else: - cookie_rewriter = None - for (name, value) in headers: lowername = name.lower() diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 0897daa9..5ba5e62b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -37,13 +37,17 @@ class RewriteContent: return (status_headers, stream) - def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): + def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''): - header_rewriter_class = (self.ruleset.get_first_match(urlkey). - rewriters['header']) + header_rewriter_class = rule.rewriters['header'] + + cookie_rewriter = None + + if urlrewriter: + cookie_rewriter = urlrewriter.get_cookie_rewriter(rule) rewritten_headers = (header_rewriter_class(). - rewrite(status_headers, urlrewriter)) + rewrite(status_headers, urlrewriter, cookie_rewriter)) # note: since chunk encoding may/may not be valid, # the approach taken here is to *always* attempt @@ -74,9 +78,12 @@ class RewriteContent: if wb_url.is_banner_only: urlrewriter = None - (rewritten_headers, stream) = self.rewrite_headers(urlrewriter, - headers, - stream) + rule = self.ruleset.get_first_match(urlkey) + + (rewritten_headers, stream) = self._rewrite_headers(urlrewriter, + rule, + headers, + stream) status_headers = rewritten_headers.status_headers @@ -112,8 +119,6 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) - rule = self.ruleset.get_first_match(urlkey) - rewriter_class = rule.rewriters[text_type] # for html, need to perform header insert, supply js, css, xml diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 880540f7..ed218bcd 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -42,6 +42,9 @@ class RewriteRules(BaseRule): # add any regexs for js rewriter self._add_custom_regexs('js', config) + # cookie rewrite scope + self.cookie_scope = config.get('cookie_scope', 'default') + def _add_custom_regexs(self, field, config): regexs = config.get(field + '_regexs') if not regexs: diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index f15a9450..c20f56f9 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -26,7 +26,7 @@ r""" """ -from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter +from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter from pywb.rewrite.url_rewriter import UrlRewriter urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') @@ -35,5 +35,5 @@ urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') def rewrite_cookie(cookie_str, rewriter=urlrewriter): - return WbUrlCookieRewriter(rewriter).rewrite(cookie_str) + return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str) diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 0b22d533..05d7dea5 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -71,7 +71,7 @@ urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') headerrewriter = HeaderRewriter() def _test_headers(headers, status = '200 OK'): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter()) return pprint.pprint(vars(rewritten)) diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index c6bd6f9c..dc16385a 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -2,7 +2,7 @@ import copy import urlparse from wburl import WbUrl -from cookie_rewriter import WbUrlCookieRewriter +from cookie_rewriter import get_cookie_rewriter #================================================================= @@ -18,11 +18,12 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None): + def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix self.full_prefix = full_prefix self.rel_prefix = rel_prefix if rel_prefix else prefix + self.root_path = root_path if root_path else '/' def rewrite(self, url, mod=None): # if special protocol, no rewriting at all @@ -83,8 +84,9 @@ class UrlRewriter(object): new_wburl = WbUrl(new_url) return UrlRewriter(new_wburl, self.prefix) - def get_cookie_rewriter(self): - return WbUrlCookieRewriter(self) + def get_cookie_rewriter(self, rule=None): + cls = get_cookie_rewriter(rule) + return cls(self) def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) @@ -149,5 +151,5 @@ class HttpsUrlRewriter(UrlRewriter): def rebase_rewriter(self, new_url): return self - def get_cookie_rewriter(self): + def get_cookie_rewriter(self, rule=None): return None diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 20cc0753..227e0198 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -46,7 +46,14 @@ rules: parse_comments: true - # flickr rules + # instagram rules + #================================================================= + - url_prefix: 'com,instagram' + rewrite: + cookie_scope: root + + + # flickr rules #================================================================= - url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo'] fuzzy_lookup: '([^/]+(?:\.css|\.js))'