import re from pywb.rewrite.content_rewriter import StreamingRewriter #================================================================= def load_function(string): import importlib string = string.split(':', 1) mod = importlib.import_module(string[0]) return getattr(mod, string[1]) #================================================================= class RegexRewriter(StreamingRewriter): #@staticmethod #def comment_out(string): # return '/*' + string + '*/' @staticmethod def format(template): return lambda string: template.format(string) @staticmethod def remove_https(string): return string.replace("https", "http") @staticmethod def add_prefix(prefix): return lambda string: prefix + string @staticmethod def archival_rewrite(rewriter): return lambda string: rewriter.rewrite(string) #@staticmethod #def replacer(other): # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' #DEFAULT_OP = add_prefix def __init__(self, rewriter, rules): super(RegexRewriter, self).__init__(rewriter) #rules = self.create_rules(http_prefix) # Build regexstr, concatenating regex list regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) # ensure it's not middle of a word, wrap in non-capture group regex_str = '(? 0: i += 1 count -= 1 if not m.group(i): continue # Optional filter to skip matches if not self.filter(m): return m.group(0) # Custom func #if not hasattr(op, '__call__'): # op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) final_str = result # if extracting partial match if i != full_m: final_str = m.string[m.start(full_m):m.start(i)] final_str += result final_str += m.string[m.end(i):m.end(full_m)] return final_str @staticmethod def parse_rules_from_config(config): def run_parse_rules(rewriter): def parse_rule(obj): match = obj.get('match') if 'rewrite' in obj: replace = RegexRewriter.archival_rewrite(rewriter) elif 'function' in obj: replace = load_function(obj['function']) else: replace = RegexRewriter.format(obj.get('replace', '{0}')) group = obj.get('group', 0) result = (match, replace, group) return result return list(map(parse_rule, config)) return run_parse_rules #================================================================= class JSLinkRewriterMixin(object): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ #JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])' #JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])' #JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-][^"\s\';&\\]*(?=["\';&\\])' JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@%.\\-]+/' def __init__(self, rewriter, rules=[]): rules = rules + [ (self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0) ] super(JSLinkRewriterMixin, self).__init__(rewriter, rules) #================================================================= class JSLocationRewriterMixin(object): """ JS Rewriter mixin which rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ (r'(?