From 5a41f59f39807575dcd1b4ec5f5e78235bf0a3cc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Feb 2014 18:02:01 -0800 Subject: [PATCH] new unified config system, via rules.yaml! contains configs for cdx canon, fuzzy matching and rewriting! rewriting: ability to add custom regexs per domain also, ability to toggle js rewriting and custom rewriting file (default is wombat.js) --- .coveragerc | 3 + pywb/cdx/cdxdomainspecific.py | 68 ++++++++++++--------- pywb/cdx/rules.yaml | 24 -------- pywb/rewrite/regex_rewriters.py | 80 ++++++++++++++++--------- pywb/rewrite/rewrite_content.py | 53 +++++++++------- pywb/rewrite/rewrite_live.py | 34 ++++++++++- pywb/rewrite/test/test_rewrite.py | 2 +- pywb/rewrite/test/test_rewrite_live.py | 34 ++++++++++- pywb/rules.yaml | 49 +++++++++++++++ pywb/utils/loaders.py | 1 + sample_archive/text_content/sample.html | 14 +++++ setup.py | 3 +- 12 files changed, 253 insertions(+), 112 deletions(-) delete mode 100644 pywb/cdx/rules.yaml create mode 100644 pywb/rules.yaml create mode 100644 sample_archive/text_content/sample.html diff --git a/.coveragerc b/.coveragerc index 63400c07..d41f9d40 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,6 +2,9 @@ omit = */test/* */tests/* + *.html + *.js + *.css [report] exclude_lines = diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2c733c8d..a9e06778 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -3,31 +3,38 @@ import re import logging import pkgutil +from pywb.utils.dsrules import BaseRule, RuleSet + from canonicalize import unsurt, UrlCanonicalizer #================================================================= def load_domain_specific_cdx_rules(filename, surt_ordered): - fh = pkgutil.get_data(__package__, filename) - config = yaml.load(fh) + #fh = pkgutil.get_data(__package__, filename) + #config = yaml.load(fh) + + canon = None + fuzzy = None # Load Canonicalizer Rules - rules = StartsWithRule.load_rules(config.get('canon_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: canon = CustomUrlCanonicalizer(rules, surt_ordered) - else: - canon = None # Load Fuzzy Lookup Rules - rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: fuzzy = FuzzyQuery(rules) - else: - fuzzy = None logging.debug('CANON: ' + str(canon)) logging.debug('FUZZY: ' + str(fuzzy)) @@ -43,10 +50,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer): def __call__(self, url): urlkey = super(CustomUrlCanonicalizer, self).__call__(url) - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.match(urlkey) if not m: continue @@ -68,10 +72,7 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) if not m: continue @@ -96,20 +97,29 @@ class FuzzyQuery: #================================================================= -class StartsWithRule: - def __init__(self, config, surt_ordered=True): - self.starts = config.get('startswith') - if not isinstance(self.starts, list): - self.starts = [self.starts] +class CDXDomainSpecificRule(BaseRule): + def __init__(self, name, config): + super(CDXDomainSpecificRule, self).__init__(name, config) - self.regex = re.compile(config.get('matches')) - self.replace = config.get('replace') + if isinstance(config, basestring): + self.regex = re.compile(config) + self.replace = None + else: + self.regex = re.compile(config.get('match')) + self.replace = config.get('replace') def unsurt(self): - # must convert to non-surt form - self.starts = map(unsurt, self.starts) - self.regex = unsurt(self.regex) - self.replace = unsurt(self.replace) + """ + urlkey is assumed to be in surt format by default + In the case of non-surt format, this method is called + to desurt any urls + """ + self.url_prefix = map(unsurt, self.url_prefix) + if self.regex: + self.regex = unsurt(self.regex) + + if self.replace: + self.replace = unsurt(self.replace) @staticmethod def load_rules(rules_config, surt_ordered=True): diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml deleted file mode 100644 index 1da70582..00000000 --- a/pywb/cdx/rules.yaml +++ /dev/null @@ -1,24 +0,0 @@ - -fuzzy_lookup_rules: - - startswith: 'com,twitter)/i/profiles/show/' - matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' - - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' - - - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] - matches: '([^/]+(?:\.css|\.js))' - - # matches all urls - - startswith: '' - matches: '[&?](?:_|uncache)=[\d]+[&]?' - -canon_rules: - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - - - - - diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 690775e7..a435b104 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -4,11 +4,16 @@ import itertools from url_rewriter import UrlRewriter + #================================================================= class RegexRewriter(object): + #@staticmethod + #def comment_out(string): + # return '/*' + string + '*/' + @staticmethod - def comment_out(string): - return '/*' + string + '*/' + def format(template): + return lambda string: template.format(string) @staticmethod def remove_https(string): @@ -20,19 +25,16 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda x: rewriter.rewrite(x) + return lambda string: rewriter.rewrite(string) - @staticmethod - def replacer(string): - return lambda x: string + #@staticmethod + #def replacer(other): + # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - DEFAULT_OP = add_prefix - def __init__(self, rules): #rules = self.create_rules(http_prefix) @@ -76,52 +78,68 @@ class RegexRewriter(object): op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) + final_str = result # if extracting partial match if i != full_m: - result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] + final_str = m.string[m.start(full_m):m.start(i)] + final_str += result + final_str += m.string[m.end(i):m.end(full_m)] + return final_str + + @staticmethod + def parse_rules_from_config(config): + def parse_rule(obj): + match = obj.get('match') + replace = RegexRewriter.format(obj.get('replace', '{0}')) + group = obj.get('group', 0) + result = (match, replace, group) return result - + return map(parse_rule, config) #================================================================= -class JSLinkRewriter(RegexRewriter): +class JSLinkOnlyRewriter(RegexRewriter): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' - def __init__(self, rewriter, rules = []): + def __init__(self, rewriter, rules=[]): rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] - super(JSLinkRewriter, self).__init__(rules) + super(JSLinkOnlyRewriter, self).__init__(rules) + #================================================================= -class JSLocationAndLinkRewriter(JSLinkRewriter): +class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): """ JS Rewriter which also rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ - def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'): + def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ (r'(?>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 6d66ce60..f3a7667a 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,12 +1,39 @@ from pywb.rewrite.rewrite_live import get_rewritten from pywb.rewrite.url_rewriter import UrlRewriter +from pywb import get_test_dir + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def test_local_1(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + + +def test_local_2_no_js_location_rewrite(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') + + # no wombat insert + assert '' not in buff + + # no location rewrite + assert 'window.location = "/other.html"' in buff + + # still link rewrite + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter) @@ -24,9 +51,10 @@ def test_example_2(): -#def test_example_3(): -# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) +def test_example_domain_specific_3(): + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) -# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff + # comment out bootloader + assert '/* Bootloader.configurePage' in buff, buff diff --git a/pywb/rules.yaml b/pywb/rules.yaml new file mode 100644 index 00000000..5cf29154 --- /dev/null +++ b/pywb/rules.yaml @@ -0,0 +1,49 @@ + +rules: + + # twitter rules + #================================================================= + - url_prefix: 'com,twitter)/i/profiles/show/' + + fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' + + + # facebook rules + #================================================================= + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' + + canonicalize: + match: 'com,facebook\)/.*[?&]data=([^&]+).*' + replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' + + + - url_prefix: 'com,facebook)/' + rewrite: + js_regexs: + - match: 'Bootloader\.configurePage.*' + replace: '/* {0} */' + + + # yahoo rules + #================================================================= + - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] + + fuzzy_lookup: '([^/]+(?:\.css|\.js))' + + + # testing rules -- not for valid domain + #================================================================= + # this rule block is a non-existent prefix merely for testing + - url_prefix: 'example,example,test)/nolocation_rewrite' + + rewrite: + js_rewrite_location: False + + + # all domain rules -- fallback to this dataset + #================================================================= + # Applies to all urls -- should be last + - url_prefix: '' + fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index a117f539..7813ded8 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,6 +9,7 @@ import urllib2 import time +#================================================================= def is_http(filename): return any(filename.startswith(x) for x in ['http://', 'https://']) diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html new file mode 100644 index 00000000..c4f3ce35 --- /dev/null +++ b/sample_archive/text_content/sample.html @@ -0,0 +1,14 @@ + + +Sample Page For Rewrite Test + + + +Test Content +Some Link + diff --git a/setup.py b/setup.py index 20ac8518..dac8a907 100755 --- a/setup.py +++ b/setup.py @@ -15,7 +15,8 @@ setuptools.setup(name='pywb', provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], + ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), + ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], # tests_require=['WebTest', 'pytest'], zip_safe=False)