From 498a8644418a707b6484520439e909462ed3a432 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 6 Oct 2014 10:14:45 -0700 Subject: [PATCH] rewriting: support setting cookie_scope at collection level js rewriting: add custom url rewrite option to per-url rewrite rules --- pywb/framework/archivalrouter.py | 4 +++- pywb/framework/wbrequestresponse.py | 9 ++++++--- pywb/rewrite/cookie_rewriter.py | 4 ++-- pywb/rewrite/regex_rewriters.py | 20 +++++++++++++------- pywb/rewrite/rewriterules.py | 6 +++--- pywb/rewrite/url_rewriter.py | 14 ++++++++++---- pywb/rules.yaml | 14 ++++++++++---- pywb/webapp/pywb_init.py | 4 ++-- 8 files changed, 49 insertions(+), 26 deletions(-) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index df5b7ec6..33230027 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -61,7 +61,8 @@ class ArchivalRouter(object): coll=coll, use_abs_prefix=use_abs_prefix, wburl_class=route.handler.get_wburl_type(), - urlrewriter_class=UrlRewriter) + urlrewriter_class=UrlRewriter, + cookie_scope=route.cookie_scope) # Allow for applying of additional filters route.apply_filters(wbrequest, matcher) @@ -99,6 +100,7 @@ class Route(object): self.request_class = request_class # collection id from regex group (default 0) self.coll_group = coll_group + self.cookie_scope = config.get('cookie_scope') self._custom_init(config) def is_handling(self, request_uri): diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 49b4737b..ddb5b8d1 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -37,7 +37,8 @@ class WbRequest(object): use_abs_prefix=False, wburl_class=None, urlrewriter_class=None, - is_proxy=False): + is_proxy=False, + cookie_scope=None): self.env = env @@ -69,10 +70,12 @@ class WbRequest(object): # wb_url present and not root page if wb_url_str != '/' and wburl_class: self.wb_url = wburl_class(wb_url_str) - self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix, + self.urlrewriter = urlrewriter_class(self.wb_url, + self.wb_prefix, host_prefix + rel_prefix, rel_prefix, - env.get('SCRIPT_NAME', '/')) + env.get('SCRIPT_NAME', '/'), + cookie_scope) else: # no wb_url, just store blank wb_url self.wb_url = None diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index d540b749..0eb507b6 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -77,8 +77,8 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): #================================================================= -def get_cookie_rewriter(rule): - if rule and rule.cookie_scope == 'root': +def get_cookie_rewriter(cookie_scope): + if cookie_scope == 'root': return RootScopeCookieRewriter else: return MinimalScopeCookieRewriter diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 97151190..9f19385a 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -90,13 +90,19 @@ class RegexRewriter(object): @staticmethod def parse_rules_from_config(config): - def parse_rule(obj): - match = obj.get('match') - replace = RegexRewriter.format(obj.get('replace', '{0}')) - group = obj.get('group', 0) - result = (match, replace, group) - return result - return map(parse_rule, config) + def run_parse_rules(rewriter): + def parse_rule(obj): + match = obj.get('match') + if 'rewrite' in obj: + replace = RegexRewriter.archival_rewrite(rewriter) + else: + replace = RegexRewriter.format(obj.get('replace', '{0}')) + group = obj.get('group', 0) + result = (match, replace, group) + return result + + return map(parse_rule, config) + return run_parse_rules #================================================================= diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index ed218bcd..5bc99e3a 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -52,11 +52,11 @@ class RewriteRules(BaseRule): rewriter_cls = self.rewriters[field] - rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) + #rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) + parse_rules_func = RegexRewriter.parse_rules_from_config(regexs) def extend_rewriter_with_regex(urlrewriter): - #import sys - #sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples)) + rule_def_tuples = parse_rules_func(urlrewriter) return rewriter_cls(urlrewriter, rule_def_tuples) self.rewriters[field] = extend_rewriter_with_regex diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index dc16385a..d90097ed 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -18,12 +18,14 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None): + def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None, + cookie_scope=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix self.full_prefix = full_prefix self.rel_prefix = rel_prefix if rel_prefix else prefix self.root_path = root_path if root_path else '/' + self.cookie_scope = cookie_scope def rewrite(self, url, mod=None): # if special protocol, no rewriting at all @@ -84,8 +86,12 @@ class UrlRewriter(object): new_wburl = WbUrl(new_url) return UrlRewriter(new_wburl, self.prefix) - def get_cookie_rewriter(self, rule=None): - cls = get_cookie_rewriter(rule) + def get_cookie_rewriter(self, scope=None): + # collection scope overrides rule scope? + if self.cookie_scope: + scope = self.cookie_scope + + cls = get_cookie_rewriter(scope) return cls(self) def __repr__(self): @@ -151,5 +157,5 @@ class HttpsUrlRewriter(UrlRewriter): def rebase_rewriter(self, new_url): return self - def get_cookie_rewriter(self, rule=None): + def get_cookie_rewriter(self, scope=None): return None diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 227e0198..f0f035ec 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -45,14 +45,20 @@ rules: parse_comments: true - - # instagram rules - #================================================================= - - url_prefix: 'com,instagram' + - url_prefix: 'com,facebook' rewrite: cookie_scope: root + # instagram rules + #================================================================= + - url_prefix: 'net,cloudfront,' + rewrite: + js_regexs: + - match: '\burl\((//[^)]+)\)' + rewrite: true + group: 1 + # flickr rules #================================================================= - url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo'] diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 9f26c4e5..7b225a48 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -179,7 +179,7 @@ def create_wb_router(passed_config={}): for name, value in collections.iteritems(): if isinstance(value, BaseHandler): handler_dict[name] = value - routes.append(Route(name, value)) + routes.append(Route(name, value, config=route_config)) continue route_config = init_route_config(value, config) @@ -187,7 +187,7 @@ def create_wb_router(passed_config={}): if route_config.get('index_paths') == '$liveweb': live = create_live_handler(route_config) handler_dict[name] = live - routes.append(Route(name, live)) + routes.append(Route(name, live, config=route_config)) continue query_handler = init_collection(route_config)