From 349a1a7a3a529136de3b8df798154437c4337456 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 25 Feb 2014 15:30:16 -0800 Subject: [PATCH 1/8] add unit test to timeutils.py tweak .travis.yml --- .travis.yml | 5 ++--- pywb/utils/timeutils.py | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index bab78128..354f2c61 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,8 @@ python: - "2.7" # command to install dependencies install: - - "python setup.py -q install" - - "pip install python-coveralls" - - "pip install pytest-cov" + - python setup.py -q install + - pip install coverage pytest-cov coveralls --use-mirrors # command to run tests #script: nosetests --with-doctest #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 7af3401f..f93f324d 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -162,6 +162,10 @@ def timestamp_to_datetime(string): >>> timestamp_to_datetime('40001965252477') datetime.datetime(2999, 12, 31, 23, 24, 59) + # not a number! + >>> timestamp_to_datetime('2010abc') + datetime.datetime(2010, 12, 31, 23, 59, 59) + """ # pad to 6 digits From 5a41f59f39807575dcd1b4ec5f5e78235bf0a3cc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Feb 2014 18:02:01 -0800 Subject: [PATCH 2/8] new unified config system, via rules.yaml! contains configs for cdx canon, fuzzy matching and rewriting! 
rewriting: ability to add custom regexs per domain also, ability to toggle js rewriting and custom rewriting file (default is wombat.js) --- .coveragerc | 3 + pywb/cdx/cdxdomainspecific.py | 68 ++++++++++++--------- pywb/cdx/rules.yaml | 24 -------- pywb/rewrite/regex_rewriters.py | 80 ++++++++++++++++--------- pywb/rewrite/rewrite_content.py | 53 +++++++++------- pywb/rewrite/rewrite_live.py | 34 ++++++++++- pywb/rewrite/test/test_rewrite.py | 2 +- pywb/rewrite/test/test_rewrite_live.py | 34 ++++++++++- pywb/rules.yaml | 49 +++++++++++++++ pywb/utils/loaders.py | 1 + sample_archive/text_content/sample.html | 14 +++++ setup.py | 3 +- 12 files changed, 253 insertions(+), 112 deletions(-) delete mode 100644 pywb/cdx/rules.yaml create mode 100644 pywb/rules.yaml create mode 100644 sample_archive/text_content/sample.html diff --git a/.coveragerc b/.coveragerc index 63400c07..d41f9d40 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,6 +2,9 @@ omit = */test/* */tests/* + *.html + *.js + *.css [report] exclude_lines = diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2c733c8d..a9e06778 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -3,31 +3,38 @@ import re import logging import pkgutil +from pywb.utils.dsrules import BaseRule, RuleSet + from canonicalize import unsurt, UrlCanonicalizer #================================================================= def load_domain_specific_cdx_rules(filename, surt_ordered): - fh = pkgutil.get_data(__package__, filename) - config = yaml.load(fh) + #fh = pkgutil.get_data(__package__, filename) + #config = yaml.load(fh) + + canon = None + fuzzy = None # Load Canonicalizer Rules - rules = StartsWithRule.load_rules(config.get('canon_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: canon = CustomUrlCanonicalizer(rules, surt_ordered) - else: - canon = None # Load Fuzzy 
Lookup Rules - rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: fuzzy = FuzzyQuery(rules) - else: - fuzzy = None logging.debug('CANON: ' + str(canon)) logging.debug('FUZZY: ' + str(fuzzy)) @@ -43,10 +50,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer): def __call__(self, url): urlkey = super(CustomUrlCanonicalizer, self).__call__(url) - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.match(urlkey) if not m: continue @@ -68,10 +72,7 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) if not m: continue @@ -96,20 +97,29 @@ class FuzzyQuery: #================================================================= -class StartsWithRule: - def __init__(self, config, surt_ordered=True): - self.starts = config.get('startswith') - if not isinstance(self.starts, list): - self.starts = [self.starts] +class CDXDomainSpecificRule(BaseRule): + def __init__(self, name, config): + super(CDXDomainSpecificRule, self).__init__(name, config) - self.regex = re.compile(config.get('matches')) - self.replace = config.get('replace') + if isinstance(config, basestring): + self.regex = re.compile(config) + self.replace = None + else: + self.regex = re.compile(config.get('match')) + self.replace = config.get('replace') def unsurt(self): - # must convert to non-surt form - self.starts = map(unsurt, self.starts) - self.regex = unsurt(self.regex) - self.replace = unsurt(self.replace) + """ + urlkey is assumed to be in surt format by default + In the case of non-surt format, this method is called + to desurt any urls + """ + self.url_prefix = 
map(unsurt, self.url_prefix) + if self.regex: + self.regex = unsurt(self.regex) + + if self.replace: + self.replace = unsurt(self.replace) @staticmethod def load_rules(rules_config, surt_ordered=True): diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml deleted file mode 100644 index 1da70582..00000000 --- a/pywb/cdx/rules.yaml +++ /dev/null @@ -1,24 +0,0 @@ - -fuzzy_lookup_rules: - - startswith: 'com,twitter)/i/profiles/show/' - matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' - - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' - - - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] - matches: '([^/]+(?:\.css|\.js))' - - # matches all urls - - startswith: '' - matches: '[&?](?:_|uncache)=[\d]+[&]?' - -canon_rules: - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - - - - - diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 690775e7..a435b104 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -4,11 +4,16 @@ import itertools from url_rewriter import UrlRewriter + #================================================================= class RegexRewriter(object): + #@staticmethod + #def comment_out(string): + # return '/*' + string + '*/' + @staticmethod - def comment_out(string): - return '/*' + string + '*/' + def format(template): + return lambda string: template.format(string) @staticmethod def remove_https(string): @@ -20,19 +25,16 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda x: rewriter.rewrite(x) + return lambda string: rewriter.rewrite(string) - @staticmethod - def replacer(string): - return lambda x: string + #@staticmethod + #def 
replacer(other): + # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - DEFAULT_OP = add_prefix - def __init__(self, rules): #rules = self.create_rules(http_prefix) @@ -76,52 +78,68 @@ class RegexRewriter(object): op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) + final_str = result # if extracting partial match if i != full_m: - result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] + final_str = m.string[m.start(full_m):m.start(i)] + final_str += result + final_str += m.string[m.end(i):m.end(full_m)] + return final_str + + @staticmethod + def parse_rules_from_config(config): + def parse_rule(obj): + match = obj.get('match') + replace = RegexRewriter.format(obj.get('replace', '{0}')) + group = obj.get('group', 0) + result = (match, replace, group) return result - + return map(parse_rule, config) #================================================================= -class JSLinkRewriter(RegexRewriter): +class JSLinkOnlyRewriter(RegexRewriter): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' - def __init__(self, rewriter, rules = []): + def __init__(self, rewriter, rules=[]): rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] - super(JSLinkRewriter, self).__init__(rules) + super(JSLinkOnlyRewriter, self).__init__(rules) + #================================================================= -class JSLocationAndLinkRewriter(JSLinkRewriter): +class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): """ JS Rewriter which also rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ - def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'): + def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ (r'(?>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', 
RegexRewriter.comment_out, 0)]) +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 6d66ce60..f3a7667a 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,12 +1,39 @@ from pywb.rewrite.rewrite_live import get_rewritten from pywb.rewrite.url_rewriter import UrlRewriter +from pywb import get_test_dir + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def test_local_1(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + + +def test_local_2_no_js_location_rewrite(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') + + # no wombat insert + assert '' not in buff + + # no location rewrite + assert 'window.location = "/other.html"' in buff + + # still link rewrite + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter) @@ -24,9 +51,10 @@ def test_example_2(): -#def test_example_3(): -# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) +def test_example_domain_specific_3(): + status_headers, buff = 
get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) -# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff + # comment out bootloader + assert '/* Bootloader.configurePage' in buff, buff diff --git a/pywb/rules.yaml b/pywb/rules.yaml new file mode 100644 index 00000000..5cf29154 --- /dev/null +++ b/pywb/rules.yaml @@ -0,0 +1,49 @@ + +rules: + + # twitter rules + #================================================================= + - url_prefix: 'com,twitter)/i/profiles/show/' + + fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' + + + # facebook rules + #================================================================= + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' + + canonicalize: + match: 'com,facebook\)/.*[?&]data=([^&]+).*' + replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' + + + - url_prefix: 'com,facebook)/' + rewrite: + js_regexs: + - match: 'Bootloader\.configurePage.*' + replace: '/* {0} */' + + + # yahoo rules + #================================================================= + - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] + + fuzzy_lookup: '([^/]+(?:\.css|\.js))' + + + # testing rules -- not for valid domain + #================================================================= + # this rule block is a non-existent prefix merely for testing + - url_prefix: 'example,example,test)/nolocation_rewrite' + + rewrite: + js_rewrite_location: False + + + # all domain rules -- fallback to this dataset + #================================================================= + # Applies to all urls -- should be last + - url_prefix: '' + fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' 
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index a117f539..7813ded8 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,6 +9,7 @@ import urllib2 import time +#================================================================= def is_http(filename): return any(filename.startswith(x) for x in ['http://', 'https://']) diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html new file mode 100644 index 00000000..c4f3ce35 --- /dev/null +++ b/sample_archive/text_content/sample.html @@ -0,0 +1,14 @@ + + +Sample Page For Rewrite Test + + + +Test Content +Some Link + diff --git a/setup.py b/setup.py index 20ac8518..dac8a907 100755 --- a/setup.py +++ b/setup.py @@ -15,7 +15,8 @@ setuptools.setup(name='pywb', provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], + ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), + ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], # tests_require=['WebTest', 'pytest'], zip_safe=False) From 453ab678ed47101b6a27422e5b83084d715ec5c6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Feb 2014 22:04:37 -0800 Subject: [PATCH 3/8] refactor domain specific rules: - head insert callback passed in with rule, up to template to handle additional inserts based on rule properties - ability to pass in custom rules config to both cdx server and content rewriter - move canonicalize to utils pkg - add wombat, modify wb.js to remove wombat-related settings --- pywb/cdx/cdxdomainspecific.py | 10 +- pywb/cdx/cdxserver.py | 10 +- pywb/config_utils.py | 8 +- pywb/pywb_init.py | 9 +- pywb/replay_views.py | 25 ++- 
pywb/rewrite/rewrite_content.py | 17 +- pywb/rewrite/rewrite_live.py | 18 +- pywb/rewrite/rewriterules.py | 53 ++++++ pywb/rewrite/test/test_rewrite_live.py | 18 +- pywb/static/wb.js | 25 +-- pywb/static/wombat.js | 219 +++++++++++++++++++++++++ pywb/ui/head_insert.html | 11 +- pywb/{cdx => utils}/canonicalize.py | 10 +- pywb/utils/dsrules.py | 98 +++++++++++ pywb/wbapp.py | 4 +- setup.py | 2 +- 16 files changed, 482 insertions(+), 55 deletions(-) create mode 100644 pywb/rewrite/rewriterules.py create mode 100644 pywb/static/wombat.js rename pywb/{cdx => utils}/canonicalize.py (95%) create mode 100644 pywb/utils/dsrules.py diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index a9e06778..006dd88d 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -5,11 +5,11 @@ import pkgutil from pywb.utils.dsrules import BaseRule, RuleSet -from canonicalize import unsurt, UrlCanonicalizer +from pywb.utils.canonicalize import unsurt, UrlCanonicalizer #================================================================= -def load_domain_specific_cdx_rules(filename, surt_ordered): +def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): #fh = pkgutil.get_data(__package__, filename) #config = yaml.load(fh) @@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): fuzzy = None # Load Canonicalizer Rules - rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: @@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): canon = CustomUrlCanonicalizer(rules, surt_ordered) # Load Fuzzy Lookup Rules - rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1a68f7e4..7f548ec4 
100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource @@ -17,13 +17,13 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - ds_rules = kwargs.get('ds_rules') + ds_rules_file = kwargs.get('ds_rules_file') surt_ordered = kwargs.get('surt_ordered', True) # load from domain-specific rules - if ds_rules: + if ds_rules_file: self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) # or custom passed in canonicalizer else: self.url_canon = kwargs.get('url_canon') @@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None): return server_cls(paths, config=pass_config, surt_ordered=surt_ordered, - ds_rules=ds_rules_file, + ds_rules_file=ds_rules_file, perms_checker=perms_checker) diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 672e8735..05844a2e 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView): return file #================================================================= -def create_wb_handler(cdx_server, config): +def create_wb_handler(cdx_server, config, ds_rules_file=None): record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) paths = config.get('archive_paths') - resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) replayer = replay_views.ReplayView( content_loader = resolving_loader, - content_rewriter = 
RewriteContent(), + content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index be4bdded..bd63bfd5 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}): route_config = DictChain(value, config) - ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules) + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, ds_rules_file) wb_handler = config_utils.create_wb_handler( - cdx_server = cdx_server, - config = route_config, + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, ) logging.debug('Adding Collection: ' + name) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 4c6907eb..9113ad5f 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed + #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -99,20 +100,34 @@ class ReplayView: def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) + result = self.content_rewriter.rewrite_headers(urlrewriter, + status_headers, + stream, + cdx['urlkey']) + (rewritten_headers, stream) = result # no rewriting needed! 
if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) return WbResponse(rewritten_headers.status_headers, response_iter) - # do head insert + def make_head_insert(rule): + return (self.head_insert_view.render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) + # do head insert if self.head_insert_view: - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) + head_insert_func = make_head_insert else: - head_insert_str = None + head_insert_func = None - (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) + result = self.content_rewriter.rewrite_content(urlrewriter, + rewritten_headers, + stream, + head_insert_func, + cdx['urlkey']) + + (status_headers, response_gen) = result if self.buffer_response: if wbrequest.wb_url.mod == 'id_': diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 80daf7e3..1ba3d321 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader +#================================================================= class RewriteContent: - def __init__(self, config=None): - self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {}) + def __init__(self, ds_rules_file=None): + self.ruleset = RuleSet(RewriteRules, 'rewrite', + default_rule_config={}, + ds_rules_file=ds_rules_file) def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header'] @@ -31,7 +34,7 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''): + def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''): # see 
if we've already rewritten headers if isinstance(headers, RewrittenStatusAndHeaders): @@ -65,7 +68,6 @@ class RewriteContent: text_type = rewritten_headers.text_type - #rewriter_class = self.rewriters.get(text_type) rule = self.ruleset.get_first_match(urlkey) try: @@ -74,10 +76,13 @@ class RewriteContent: raise Exception('Unknown Text Type for Rewrite: ' + text_type) #import sys - #sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey)))) + #sys.stderr.write(str(vars(rule))) if text_type == 'html': - head_insert_str = rule.create_head_inserts() + head_insert_str + head_insert_str = '' + + if head_insert_func: + head_insert_str = head_insert_func(rule) rewriter = rewriter_class(urlrewriter, outstream=None, diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 9d752d10..63783234 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -7,11 +7,11 @@ import mimetypes from pywb.utils.loaders import is_http from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.canonicalize import canonicalize from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.rewrite_content import RewriteContent -from pywb.cdx.canonicalize import canonicalize """ Fetch a url from live web and apply rewriting rules @@ -43,7 +43,7 @@ def get_local_file(uri): return (status_headers, stream) #================================================================= -def get_rewritten(url, urlrewriter, urlkey=None): +def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): if is_http(url): (status_headers, stream) = get_status_and_stream(url) else: @@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None): if not urlkey: urlkey = canonicalize(url) - status_headers, gen = RewriteContent().rewrite_content(urlrewriter, - status_headers, - stream, - head_insert_str='', - urlkey=urlkey) + rewriter = RewriteContent() + + result = 
rewriter.rewrite_content(urlrewriter, + status_headers, + stream, + head_insert_func=head_insert_func, + urlkey=urlkey) + + status_headers, gen = result buff = '' for x in gen: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py new file mode 100644 index 00000000..e1584162 --- /dev/null +++ b/pywb/rewrite/rewriterules.py @@ -0,0 +1,53 @@ +from pywb.utils.dsrules import BaseRule + +from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter +from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from html_rewriter import HTMLRewriter +from header_rewriter import HeaderRewriter + +import itertools + +class RewriteRules(BaseRule): + def __init__(self, url_prefix, config={}): + super(RewriteRules, self).__init__(url_prefix, config) + + self.rewriters = {} + + #self._script_head_inserts = config.get('script_head_inserts', {}) + + self.rewriters['header'] = config.get('header_class', HeaderRewriter) + self.rewriters['css'] = config.get('css_class', CSSRewriter) + self.rewriters['xml'] = config.get('xml_class', XMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + # Custom handling for js rewriting, often the most complex + self.js_rewrite_location = config.get('js_rewrite_location', True) + self.js_rewrite_location = bool(self.js_rewrite_location) + + # ability to toggle rewriting + if self.js_rewrite_location: + js_default_class = JSLinkAndLocationRewriter + else: + js_default_class = JSLinkOnlyRewriter + + # set js class, using either default or override from config + self.rewriters['js'] = config.get('js_class', js_default_class) + + # add any regexs for js rewriter + self._add_custom_regexs('js', config) + + def _add_custom_regexs(self, field, config): + regexs = config.get(field + '_regexs') + if not regexs: + return + + rewriter_cls = self.rewriters[field] + + rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) + + def extend_rewriter_with_regex(urlrewriter): + #import sys + 
#sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples)) + return rewriter_cls(urlrewriter, rule_def_tuples) + + self.rewriters[field] = extend_rewriter_with_regex diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index f3a7667a..36e74848 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -8,9 +8,18 @@ from pywb import get_test_dir urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def head_insert_func(rule): + if rule.js_rewrite_location == True: + return '' + else: + return '' + def test_local_1(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'com,example,test)/', + head_insert_func) # wombat insert added assert '' in buff @@ -23,7 +32,10 @@ def test_local_1(): def test_local_2_no_js_location_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'example,example,test)/nolocation_rewrite', + head_insert_func) # no wombat insert assert '' not in buff @@ -55,6 +67,6 @@ def test_example_domain_specific_3(): status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) # comment out bootloader - assert '/* Bootloader.configurePage' in buff, buff + assert '/* Bootloader.configurePage' in buff diff --git a/pywb/static/wb.js b/pywb/static/wb.js index a7b39370..c4798da8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -1,18 +1,21 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. +This file is part of pywb. 
-// Rewritten location and domain obj setup -window.WB_wombat_location = window.location + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -if (window.top != window) { - window.top.WB_wombat_location = window.top.location -} - -if (window.opener) { - window.opener.WB_wombat_location = window.opener.location -} - -document.WB_wombat_domain = document.domain + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with pywb. If not, see . +*/ function initBanner() { diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js new file mode 100644 index 00000000..d2b7d12c --- /dev/null +++ b/pywb/static/wombat.js @@ -0,0 +1,219 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. + +This file is part of pywb. + + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with pywb. If not, see . 
+*/ + +//============================================ +// Wombat JS-Rewriting Library +//============================================ + +var WB_wombat_replayPrefix; +var WB_wombat_replayDatePrefix; +var WB_wombat_captureDatePart; +var WB_wombat_origHost; + + +function WB_StripPort(str) +{ + var hostWithPort = str.match(/^http:\/\/[\w\d@.-]+:\d+/); + if (hostWithPort) { + var hostName = hostWithPort[0].substr(0, hostWithPort[0].lastIndexOf(':')); + return hostName + str.substr(hostWithPort[0].length); + } + + return str; +} + +function WB_IsHostUrl(str) +{ + // Good guess that's its a hostname + if (str.indexOf("www.") == 0) { + return true; + } + + // hostname:port (port required) + var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + // ip:port + matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + return false; +} + +function WB_RewriteUrl(url) +{ + var httpPrefix = "http://"; + + // If not dealing with a string, just return it + if (!url || (typeof url) != "string") { + return url; + } + + // If starts with prefix, no rewriting needed + // Only check replay prefix (no date) as date may be different for each capture + if (url.indexOf(WB_wombat_replayPrefix) == 0) { + return url; + } + + // If server relative url, add prefix and original host + if (url.charAt(0) == "/") { + + // Already a relative url, don't make any changes! 
+ if (url.indexOf(WB_wombat_captureDatePart) >= 0) { + return url; + } + + return WB_wombat_replayDatePrefix + WB_wombat_origHost + url; + } + + // If full url starting with http://, add prefix + if (url.indexOf(httpPrefix) == 0) { + return WB_wombat_replayDatePrefix + url; + } + + // May or may not be a hostname, call function to determine + // If it is, add the prefix and make sure port is removed + if (WB_IsHostUrl(url)) { + return WB_wombat_replayDatePrefix + httpPrefix + url; + } + + return url; +} + +function WB_CopyObjectFields(obj) +{ + var newObj = {}; + + for (prop in obj) { + if ((typeof obj[prop]) != "function") { + newObj[prop] = obj[prop]; + } + } + + return newObj; +} + +function WB_ExtractOrig(href) +{ + if (!href) { + return ""; + } + href = href.toString(); + var index = href.indexOf("/http", 1); + if (index > 0) { + return href.substr(index + 1); + } else { + return href; + } +} + +function WB_CopyLocationObj(loc) +{ + var newLoc = WB_CopyObjectFields(loc); + + newLoc._origLoc = loc; + newLoc._origHref = loc.href; + + // Rewrite replace and assign functions + newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); } + newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); } + newLoc.reload = loc.reload; + newLoc.href = WB_ExtractOrig(newLoc._origHref); + newLoc.toString = function() { return this.href; } + + return newLoc; +} + +function WB_wombat_updateLoc(reqHref, origHref, location) +{ + if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) { + var finalHref = WB_RewriteUrl(reqHref); + + location.href = finalHref; + } +} + +function WB_wombat_checkLocationChange(wbLoc, isTop) +{ + var locType = (typeof wbLoc); + + var location = (isTop ? 
window.top.location : window.location); + + // String has been assigned to location, so assign it + if (locType == "string") { + WB_wombat_updateLoc(wbLoc, location.href, location) + + } else if (locType == "object") { + WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location); + } +} + +var wombat_updating = false; + +function WB_wombat_checkLocations() +{ + if (wombat_updating) { + return false; + } + + wombat_updating = true; + + WB_wombat_checkLocationChange(window.WB_wombat_location, false); + + if (window.self.location != window.top.location) { + WB_wombat_checkLocationChange(window.top.WB_wombat_location, true); + } + + wombat_updating = false; +} + +function WB_wombat_Init(replayPrefix, captureDate, origHost) +{ + WB_wombat_replayPrefix = replayPrefix; + WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/"; + WB_wombat_captureDatePart = "/" + captureDate + "/"; + + WB_wombat_origHost = "http://" + origHost; + + window.WB_wombat_location = WB_CopyLocationObj(window.self.location); + + + if (window.self.location != window.top.location) { + window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location); + } + + if (window.opener) { + window.opener.WB_wombat_location = (window.opener ? 
WB_CopyLocationObj(window.opener.location) : null); + } + + + document.WB_wombat_domain = origHost; + +} + +// Check quickly after page load +setTimeout(WB_wombat_checkLocations, 100); + + +// Check periodically every few seconds +setInterval(WB_wombat_checkLocations, 500); diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index b30cd015..aa910442 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,7 +1,14 @@ +{% if rule.js_rewrite_location %} + +{% endif %} + + diff --git a/pywb/cdx/canonicalize.py b/pywb/utils/canonicalize.py similarity index 95% rename from pywb/cdx/canonicalize.py rename to pywb/utils/canonicalize.py index e2f818b9..bd21e4ca 100644 --- a/pywb/cdx/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -3,8 +3,6 @@ import surt import urlparse -from cdxobject import CDXException - #================================================================= class UrlCanonicalizer(object): @@ -15,6 +13,12 @@ class UrlCanonicalizer(object): return canonicalize(url, self.surt_ordered) +#================================================================= +class UrlCanonicalizeException(Exception): + def status(self): + return '400 Bad Request' + + #================================================================= def canonicalize(url, surt_ordered=True): """ @@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True): try: key = surt.surt(url) except Exception as e: - raise CDXException('Invalid Url: ' + url) + raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url if not surt_ordered: diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py new file mode 100644 index 00000000..2e6f9626 --- /dev/null +++ b/pywb/utils/dsrules.py @@ -0,0 +1,98 @@ +import yaml +import pkgutil + +#================================================================= + +DEFAULT_RULES_FILE = 'rules.yaml' +DEFAULT_RULES_PKG = 'pywb' + + 
+#================================================================= +class RuleSet(object): + DEFAULT_KEY = '' + + def __init__(self, rule_cls, fieldname, **kwargs): + """ + A domain specific rules block, inited via config map. + If config map not specified, it is loaded from default location. + + The rules are represented as a map by domain. + Each rules configuration will load is own field type + from the list and given a specified rule_cls. + """ + + self.rules = [] + + ds_rules_file = kwargs.get('ds_rules_file') + default_rule_config = kwargs.get('default_rule_config') + + config = self.load_default_rules(ds_rules_file) + + rulesmap = config.get('rules') if config else None + + # if default_rule_config provided, always init a default ruleset + if not rulesmap and default_rule_config is not None: + self.rules = [rule_cls(self.DEFAULT_KEY, default_rule_config)] + return + + def_key_found = False + + # iterate over master rules file + for value in rulesmap: + url_prefix = value.get('url_prefix') + rules_def = value.get(fieldname) + if not rules_def: + continue + + if url_prefix == self.DEFAULT_KEY: + def_key_found = True + + self.rules.append(rule_cls(url_prefix, rules_def)) + + # if default_rule_config provided, always init a default ruleset + if not def_key_found and default_rule_config is not None: + self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config)) + + @staticmethod + def load_default_rules(filename=None, pkg=None): + config = None + + if not filename: + filename = DEFAULT_RULES_FILE + + if not pkg: + pkg = DEFAULT_RULES_PKG + + if filename: + yaml_str = pkgutil.get_data(pkg, filename) + config = yaml.load(yaml_str) + + return config + + def iter_matching(self, urlkey): + """ + Iterate over all matching rules for given urlkey + """ + for rule in self.rules: + if rule.applies(urlkey): + yield rule + + def get_first_match(self, urlkey): + for rule in self.rules: + if rule.applies(urlkey): + return rule + + 
+#================================================================= +class BaseRule(object): + """ + Base rule class -- subclassed to handle specific + rules for given url_prefix key + """ + def __init__(self, url_prefix, rules): + self.url_prefix = url_prefix + if not isinstance(self.url_prefix, list): + self.url_prefix = [self.url_prefix] + + def applies(self, urlkey): + return any(urlkey.startswith(x) for x in self.url_prefix) diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 0befa172..ac51ba9d 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect from wbrequestresponse import WbResponse, StatusAndHeaders from pywb.cdx.cdxserver import CDXException +from pywb.utils.canonicalize import UrlCanonicalizeException from pywb.warc.recordloader import ArchiveLoadFailed import os @@ -55,7 +56,8 @@ def create_wb_app(wb_router): except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except (WbException, CDXException, ArchiveLoadFailed) as e: + except (WbException, CDXException, + UrlCanonicalizeException, ArchiveLoadFailed) as e: response = handle_exception(env, wb_router.error_view, e, False) except Exception as e: diff --git a/setup.py b/setup.py index dac8a907..0750fe55 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, + package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], From 22f1f78fcabbc5deec3c441d9fc0ffce3d43f178 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer 
Date: Thu, 27 Feb 2014 18:22:10 +0000 Subject: [PATCH 4/8] cdx: clean up filters, add '~' modifier for contains rules: fix regex to be lazy not greedy, turn off unneeded custom canonicalizer (need tests for custom canon) cleanup fuzzy match query fix data package in setup.py --- pywb/cdx/cdxdomainspecific.py | 17 +++++++++++------ pywb/cdx/cdxops.py | 33 ++++++++++++++++++++++++--------- pywb/cdx/cdxserver.py | 15 +++++++-------- pywb/cdx/test/cdxserver_test.py | 10 ++++++++++ pywb/rules.yaml | 9 +++++---- setup.py | 2 +- 6 files changed, 58 insertions(+), 28 deletions(-) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 006dd88d..54654b5e 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): if rules: fuzzy = FuzzyQuery(rules) - logging.debug('CANON: ' + str(canon)) - logging.debug('FUZZY: ' + str(fuzzy)) + logging.debug('CustomCanonilizer? ' + str(bool(canon))) + logging.debug('FuzzyMatcher? 
' + str(bool(canon))) return (canon, fuzzy) @@ -73,6 +73,8 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] + filter_ = params.get('filter', []) + output = params.get('output') for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) @@ -82,7 +84,7 @@ class FuzzyQuery: matched_rule = rule if len(m.groups()) == 1: - params['filter'] = '=urlkey:' + m.group(1) + filter_.append('~urlkey:' + m.group(1)) break @@ -91,10 +93,13 @@ class FuzzyQuery: inx = url.find('?') if inx > 0: - params['url'] = url[:inx + 1] + url = url[:inx + 1] + + params = {'url': url, + 'matchType': 'prefix', + 'filter': filter_, + 'output': output} - params['matchType'] = 'prefix' - params['key'] = None return params diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 247f3d18..1a90d7ca 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings): if self.invert: string = string[1:] - self.exact = string.startswith('=') - if self.exact: + # exact match + if string.startswith('='): string = string[1:] + self.compare_func = self.exact + elif string.startswith('~'): + string = string[1:] + self.compare_func = self.contains + else: + self.compare_func = self.regex parts = string.split(':', 1) # no field set, apply filter to entire cdx @@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings): self.field = parts[0] string = parts[1] - if self.exact: - self.exact_str = string - else: + # make regex if regex mode + if self.compare_func == self.regex: self.regex = re.compile(string) + else: + self.filter_str = string def __call__(self, cdx): val = cdx[self.field] if self.field else str(cdx) - if self.exact: - matched = (self.exact_str == val) - else: - matched = self.regex.match(val) is not None + + matched = self.compare_func(val) + return matched ^ self.invert + def exact(self, val): + return (self.filter_str == val) + + def contains(self, val): + return (self.filter_str in val) + + def regex(self, 
val): + return self.regex.match(val) is not None + filters = map(Filter, filter_strings) for cdx in cdx_iter: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 7f548ec4..8eff842c 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -50,14 +50,14 @@ class BaseCDXServer(object): url = params['url'] - if self.fuzzy_query and params.get('allowFuzzy'): - if not 'key' in params: - params['key'] = self.url_canon(url) + # check if fuzzy is allowed and ensure that its an + # exact match + if (self.fuzzy_query and params.get('allowFuzzy') and + params.get('matchType', 'exact') == 'exact'): - params = self.fuzzy_query(params) - if params: - params['allowFuzzy'] = False - return self.load_cdx(**params) + fuzzy_params = self.fuzzy_query(params) + if fuzzy_params: + return self.load_cdx(**fuzzy_params) msg = 'No Captures found for: ' + url raise CaptureNotFoundException(msg) @@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - #params['key'] = self.url_canon(url) match_type = params.get('matchType', 'exact') key, end_key = calc_search_range(url=url, diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 0e799ce9..384d7187 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz +# Filter contains +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz 
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter contains invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + # Collapse by timestamp # unresolved revisits, different statuscode results in an extra repeat >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 5cf29154..8927d2f1 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -12,11 +12,12 @@ rules: #================================================================= - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' - canonicalize: - match: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' +# not actually needed, fuzzy match is used instead here +# canonicalize: +# match: 'com,facebook\)/.*[?&]data=([^&]+).*' +# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - url_prefix: 'com,facebook)/' diff --git a/setup.py b/setup.py index 0750fe55..94c1bca7 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']}, + package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, 
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], From 7863b2bade76443823a702dc81e9fc76128a9f7d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Feb 2014 20:10:44 +0000 Subject: [PATCH 5/8] add sample data for zipnum #17 --- sample_archive/zipcdx/zipnum-sample.cdx.gz | Bin 0 -> 9768 bytes sample_archive/zipcdx/zipnum-sample.idx | 38 +++++++++++++++++++++ sample_archive/zipcdx/zipnum-sample.loc | 1 + 3 files changed, 39 insertions(+) create mode 100644 sample_archive/zipcdx/zipnum-sample.cdx.gz create mode 100644 sample_archive/zipcdx/zipnum-sample.idx create mode 100644 sample_archive/zipcdx/zipnum-sample.loc diff --git a/sample_archive/zipcdx/zipnum-sample.cdx.gz b/sample_archive/zipcdx/zipnum-sample.cdx.gz new file mode 100644 index 0000000000000000000000000000000000000000..8687b97a4f9db618b9b3defac200e2ffb89e865b GIT binary patch literal 9768 zcmZ9SRa6~okcIJJ32?B3yW4@_?tXB0cS~@0hu}_dcS3Nt;O?%$B{)HcduOd#!+q z&Q`VoHo7Z!#uUT7+s0I&)Zk=n&T(_h{eatt(_|Ws;|6tFQNJIK5*?Xh%Oj5#M^Yb;ZAN$l4T+@O8=P~g5=g((e z>^ozr7iemxg+NxHXcQ*0V{PeS&ycb*12VsQfcLS{-eUtcm^uTIOD{MwESiw$1lBba z-F27YJA1m4VSQ(w^P!^vWZpB>a|{#&1kwN1q&==%E>I9P4@}>BQiF6OPE+|a zSP8`7#tdUgp_W{uE#>85X^f&4Egqe=WAbvIfiq>pI9xG2&YUO85he#-o2Ah>&?|ia z?074qkHMXmeSirf{e$cQ%Zw5*74UGu!kmy{X;oaCP}5KRJx5R70AiZKEVorxSF&;} zU1x76+^y|UKN`=#+XM3Og##`n>PI!a4}d<*6T|7!r4ZMKZwkk zxB8Unl_3+(!)?K>7bz{Jqt;wfuxM7cQ+A^mtt?DGETG~^wEaEE#JQc}N;WC4hg3cI zNNNsa$v8!BlK#({A-8Lw+nI+A&a#L(H?(;dG_X1za^mZbC{hHPx=i4+o zMAlI_f&7h=BOJk(3}si~cW$fKMnP12k}srhzO)h(^!qxF$n^+f2;TYh+kkEMrTa^mw2tmiNR&PlY#rd|4ly1aLl3Ec>nUuT(DPbZ@J<+Co<*?#+z z>ca$dA6~Z5CdLhOo_pMFEkk=x$H~oPH>8S%9=;lysGbh%EGyAfIBIJQEVjhzGi2vD zU&jjrW9o{f8ZZ=r;+Ac)`$I?0gx$vJr;gM&rKn-IP*^$@@b^n%5>#BJkZyi%U4BJdcs+w#mdstPu9 zHFx2#ZYvDU$-`1+ZNQeQ{N-mt(F#16;XvcX@Q9DSLVo?zz1ZSLb4BaB*V^uw2_qfy 
zWI=b{)Q(S^qRx2#W8Z%78Ev^1AW4FNM$$d>IFfo*WT~-W(7^aI)K;H%$9Uy2!Rp?= zf3v04GA#l@G{il3;gvpywL~eq%_+5?;Z(osQw89Ys&k=$sH6xIN2O_6JHoBFcmII` zT>u37p^C?;{2+orto#6n3M3Cyl!qvo>Vkk3&+C4B#KreBNMt|Ve}R00dI)#?4{CqB zjQpFc44nH!w5P*AZq{yojJ5!0W4K=%sY%DdX#QwTD#%_B?(1XKvr6|!@0ImQ*ci=@ zJSs9{Xbq_jPK+hCNdum_m6Px4E=kWz??iKGE0+>X+Ts~*A4%K5tS24Wk(@lP;{z$N z=a^E1e^q=KUE?S+&-2TBsxYnegS`Ah@ipL(+&il<91l(i2u`qIn_w@WbdONtGlCfC z_T!$t;{~1J0?nM8wtcbxFwqM_6s9U2%R*sYibJ|jVY^}V?;^uJ0A6EBunl(6ux;*i zY2?vSP3!GbIUk_DvYm>C9()LQ)>X!qoofqjQz&F908l(m(xJM^&Le%5-(LEfW^=+h z(n~*x@O|Hs*F9DCkUF^5_L2V&{WcNBF;Z9p=hxMD;^?0ldlLsn$d-5YwhhwQQ+8UP zE}C|bRv{dlok?seoV2)E5uC>jWtG-J70H@!Td4V$%xXfbl|@ev&JaP^op=Y2uVCca z?<16pl;j;1`~nqJ%vm|w)W3qUqAXSZ5bn4OTdp(|OrY+U>r0PR{XCPg-;l1nkvJNW zAvl6Z;I*HZVaEJxRKI+p+Ci7)5)%xn?q|=Q*kdZnAom*-q=_aCTxsqQDy!zX`w(L2 z;S~atBvp0qhn)3oLcwo2X9}8SR{(?xGA-@f0@Cze;NXLf`9)t$bRdgnWea!SqfQ9P z^n!rDE?Tom)eXUy9pLq!tskeD6*zR#%2Uw9)sgaXq<(G|H{KL%jv1Y!yoP39-Bi?8 zwqu1F!$VON;P(yHGsC-CEq zqZ?jaSex9u8Ay6Wf~GQnAgh!bx&U%*k|PgyZ-zywyab*rxp8jKC3y@tYN35uY_s;TV+;cDT(;=!B9>Mhc+MI=NGVFtHO)U-#aJnTNr$4#&l?wQuMw_f44zeEQ=Xd!E!TTl{X_N zJygb@a7;HFd93IpH)_!-NZ8s{Y(hf;HL_qBLa4QhX| z8KEb_NGBDPHvi}7*;MKfgluwxy28;}a3(@ar2O}8R^disdEva$norc=2dU>M`-+o# zL7vVAUq!as;}3ef>ofMXGTsW&oL=xQK(TwFy;dkahVBs?9{U0r6`C5>h=oLS6f%xE zM~RYMSr{V&GQe+{+-f=|X!%tHAZS9pBtJ+M$8j0<9<@Cd{7?9WoWJ}Veq5~wl{x+v z(knTpz5ND-l6jLgc+4*z{)9H^zbvBlsLpw>Mpu6R)=XOXOVgo`6Nr<$TIjYmJHKhB zJWE*}Z5)wulWKPYQ4PH4K2lr`gJk4RqDr|3utqIRagwIUx{EQ)x8x~P{88sFmRGZ_;)E6Y8Fdg!pa!{jpAZBTO^lR+MH>fb_42uZ>wc)H z((3p&kiec4Cu&{a(wA|0du*G*Kgaas*un$5cp(zP%o5j3r`!S2f0`xZ-be*Qi3LW~ z?Zi!NQ$S($_(D6r3wB%fE24x9E$OR}lH1V}HkkI|F5XX2sp>ZU2|}P`Crsy^7TV+G zkpcuU`p&&0Tt8a9p0Ctvdw>EWJ{7tl#DSP-%gL_~S6ZCmg=>!gnC*SA$5m%6Ya8Fc zR&pZvM+cV2Wi3>C5KPp+U=e>su{s<-x`@4Xe)l(NG+ie!bQN<}_gq!|rY7i$UzVeF zhJT5W%v&;^Goduxm79ge>{#J%0)<6*PV}@|56^Rw(C|RTUxH~CHm(9#TLuXp!uXxt zSh!e(4OT*>gCP<&;uL?}*E7FJ0ULlKjE_1|th|PJ&oI|}o~d@kx4{l(bG>*`ZL+n! 
zJ^SV)DwwoH!$KGrfs`3cnqtsZ%(1Kq514dW#Nl70-OVSZtIOrhI?@8rG99QCUgP%ld$9kkft}06+;eI(RiVl5k-{>n}TeaJ9i#rO! zw&vOIE36xIqB5>Y?{<$q*{pwAMg8lo;2o`EF;*S7>lasf*&Y{hzb}a-J1+Y`c8DSg zH;?*H`LoUVa&YdwkgvuI8uR%B$jAC-I%cJPe$ zmp)U!jL0+w*dyJ9V6V z_%FkTHUdtZb9bsdMDfj>!s#+f>rkX$`+ODiLJ0|I87l!o(%HQ$66)!nqlzX?mt<91 z;v?!}C8V@{G%vNcgqx7gvQ@FfGC!b1h{@A_bNjJ8avy9b`U4lo8w3BIxHp^Z5~jew z*FUo)GnfVlE$&T8_4)!SYb)l>^S;IL|F$^jZFJY}UOhd1rf`vlfeqK9o)Q;^XO51O z0d3BBY9V6L))Kgw3Ai-4{4NZEU z(;fkMbZbTAJ<)XSn`n{YHrI*=X%|~%(pzs=6A=-WoE6F<6P2v6=@yK-iyK?GI7?>P zc3=O(m+COn&Ny->4pdZ8&|Jd~b33~GV6DNlAkdMENCy{e&9bJ{CfcqKxT zseczNxXo61X7%n9GgmX$C=Ri1;cyR&@b*MhkmO>?vR&1xNH zni*y8XsmfGw1*aPD-&cdgI(#}B*-KP+*lzK0KEPB3Q7noj=~C}gsR zH@INdw@s+#q)hLm0CrMkq`HjAw=NymFEOknK_HWW#7!_)*IflfFL=scdipHfilEpd zE(B!LJAy33k#Ee6+?du-4n^Vi28nMSe{8GQ%VCLd>Wnx0QddtT1Kj_^H4T=5W=4JE zddtqdjESLw3!Pjb8fNKyB#iE~;DTg}Q@_6bd_Hy|y$quv<>+G&rd2q44#g!n8`lu|kWt&jYagy`Y*4t`UkCJ~C&LhPL%_{#M=Z z)SDrM2Y-E>y_+u^BPukP%j@;+o|f&Ob_n={lw<0aA@kVgd*l~Uhv_2qss2)TU^?OZ zM*S}2EAT#|JJxqGX3a@}S|fLAPEBC!Mm1Wy#c;CSFF*n%!{1Z<|YzeiZi?VYo#=EhUlprgTMEuV~=6^`I=I0~``kSvnvq~y`L8&w$#w%ILO1Iuj zzg*DrI{EUew}pPSSdW8|5O}Zf&h^ygW9aULZJg}pgGXf4o2J|C-FSYI=deNMQ_60+ zPH~FdXKUW$eJ?7aGrrM1P@77;h6NQTkLezm6c>=y%vr&|7SrKpaS-s0b>+LAg zP=s2vvt;D_GR}8iKPx|$E5U2fWMNaprF7U4Q**{$l8u7=2!lea>F23!zx&3WjfFYM z!{g@%mMpi5H$T=M+|)e38CbcstaD~6`B*u2wmBcn)6~=6d1P!gOYt;M@Ls=N++>thT?c^K+!2G+nJv!`NAwJSiNNsU*5kUZ1HPP8y78YQUMf zmHE13Jgl{DCKadCyFi~3OI%S3{6z3uVOVJ?#a)YnMYUWw;3bJvmQyGU9q!gvdrjzV z{^v$6F=g1d8M^EUnBj*RHIlld_zaR$JqoT;=%gcS++k8ANIZI zIlY!RSB~sK>gz{!;unD?WQ9Njr^~cX2wLPplc@KlnwEJRe^4PdJ>QPA(3#Agp+xCV z#;M*sMiR6Y;pW`D$!h!Kc?x*Lea_JxW1#k79L=y%nEYN9Ga0KWMuwwea`%{3n0|n$CNl0vI5NtN)IgX^jhg*HSlD02}7|iDbszHxi(wkp)^Qck~^%pQj zK|>y2)2J9f2pQjbZLDJdF*&WZeor*XhtvalGpIHFFXaXaBZX2?`S*&tefCaqqyXq; zNd~N^8#Rd!LWsICt)4{dv8~_a>wb=*yc}`n#>zVQ%GS}hz+f*vZ{?)_qLxI@!-Uz- zYr5fN8i{Y0*Bo4{K4uw<;T7O}TEj*aV$nNE&@|e0V9mCFY_;{}%CNC$SDWd!;>uE^ zLn~SIo5z*vrrLfEY&NHR8cA9NE~xnzVX5aRMLP@ypp_I>3}DXM7Oh3dsO47<6c#lx 
zr=})8TiwAn9r@kRn6+@21_j@ zxG&a;Enz70v%Xx>kB4~^QJ980KKaJubVZnzTPJ>eV z65J;=o)k(T8(=62NASXjkPn-1Xlhy)@mW@ntl*xNHJ4 zTz=`U%Mfd69L`rEUkWeD%tBX%kmgi9VAa48*sah@i$A!l^??K{mxGMoSL)qCZ>g)+ zJXI6A&LIw(WUO@qQK+b}Jlv!;=7_k(a*csv&Ou^Hqe>YoG_7WA=!1nLOU0XJY)+-Z zjFQB-at_w%U&l388tgU{Rfei*w3F{x8^Bt-Cx<4thR03PBSqSw#EsH6CV~Sy59FV^ zxmdSMi5T(>Fpp+fmi=dfr{P$0<6)ks=hbR6+9>-wH7vz{;YvxP^m>$t_W=TKqc%lq zcRKu0g4$YnQR%KzHBiE=H}*FcIbfT3SPb7oO@jY%FS!48ua?$2eY-})JQRQ^{3#&v zgR-y~MZSKvd(@{bp*_}Kp5ER>=axYm!~NZK4j7spMzU=G435dh02UR=QbYL0hRWn1 zr?)*e{D+*c?NV8bdAM!e4`O+$MwAtDXSMogE<`UThXIk_k1G)JTU^sqzk$`;4(LM$ z1VmkjGLo>;ioI;e5eb`R-EuSQd{;WvUfCioJDI7vp!6IQeo*gG}IH;Eb_af45; z@@qI5=Ap;x|8(#_!ckLea{Y&E`rj*Sh<}ExYRY8qQ5NMk|Bt1>+Azg~ew!MhFdBH3 zS$(Fj%wSn>{6Rv>x}~(gEK6lW@`!b815X_`%>V%XkN{J~4^xq{U=>bL*iW7(Ef0xT z9p{x7VspD2s%b4gv@A22bIG0K(pN^y6*Nb77F57Y>-r#I?_|uZsaJ-=j}FN-R(_e2 zK^bUvyl)NRaV}vncp0}vLlsZYB@Ya06UBYwI9s60Yzz-&w9!W*CPW%USp6hf-CI|A@E${29~n<5yR6U5UmF5&+>t2>)C57KK*K#XlU+*a?NAp0I*OJ)id4SIGle z`3Jl^CJ-56myz9F+6l%2O<`w|C=}u`dPD-0HT6O=)m8g((MG_Ry6XLVU1JwnSe6J_ z_b*zE$#$$?2g+l^*b{U?8{)b0bq4(D;A3HUZ@!+Lo z*02=wpF|=|tTVL8<~N+9Bn7od8W?bc^IPPnF!et3qv|KXkZ=7L=xvtxC#9Yvmf+m= z*FC9LXh^s6P9XW%Nl1z!E6OQuj_b5w8H8)*xOdq(`xV9-Mk`TE{mg z#}n*^n@5A$?o&UxY6YcWsTID!GF=sakUDSQHW!`Z%lk;3W^{^XyN^Oc>%VLnO&zLB zCM{T5(7Qsqe4^{LWIMyDGM10>>jUbPBsb`t@{*=Uz6O9W;5NOpNms^I0K$?rJd8>~ z5yaFf;80pxIlAP&<=eyZdhG90hfRZ&j^o+qnNn+`c4bDII=!K=p}?UC^4|Wp&2zmF z_QMKNfXS!{rE%fMJ>d#Fl2se_=_;wRjHSa4@Kltca)(+QAP3fouRAcT8P3rtLn-~C z8Plw)a;6J0I;WGt5;6&tqofL*a8tHB%_yAituxozvt6qfNd)MJXBL%@zYWgwLU&-y z%5GqQpXB*<<~T7gktG84S*lG>c*h@4RE~wUWf&W>kp@mwSp^KOZ)uL|pvpee;?9`7 z15hhWZJ-1*V=^!z*6xLhg4HI_U&H_62?_yCMx&hZ_j(6^8ZAU|o1IArEi}1S@+0ZE zc8Wwp)Pq%}nXb>XDw;}u4S0w$P7wv+uMh~-c_n{R68>O_ztgb>R;o_8xm0jPA9!fk zIgP7l=Qhu#=VBW8AgIZ81>HdxgDIV2AQ4`2zCv;)mY-PIoN}mXQe)8?%bF%ek4jHJ+f+0T#K_eTi2!3`&Z4c3k%iXjwP&h zb*xO4&s|wyx}A36wldK_lUSz0&U?jw^1j97R0M>2aKiEZ1?}N(W;*=pa$e;UV0=8z 
zZag|Hx&7O~%SNowdyV*azp1f-gf}ZBWUZ@KJes?ONws#@lf#~PMTZkNp#&?Y}BbNH3w9mH0&p4|w|dv4>y`Q)rj*po|pj|E6&ro>!+VX!j}=!Vh$A5 zM_vu@Rn|^H+xF^}Mi2L@>8U5>o9~A+Mn7#i;;E4m4`hMpg648z&R>A*k$=G?L20O6V@1y%QAcUqAn--ph&q=|;JPaqh@PZa;fjTre;EdHV2D$-O^TE zDH(_{Ypn;)%@G$8Z5&rk8Mj>Lf??)yF;t{mfqK$2dhwae7>W@PF4QKztQ&wHw+P~Q zzB0ljy)Yy@Dl Date: Thu, 27 Feb 2014 12:33:11 -0800 Subject: [PATCH 6/8] add first set of zipnum tests #17 still need to test timed reload, multi sources --- pywb/cdx/cdxserver.py | 2 +- pywb/cdx/test/cdxserver_test.py | 5 +-- pywb/cdx/test/zipnum_test.py | 44 +++++++++++++++++++++++++ sample_archive/zipcdx/zipnum-sample.loc | 2 +- setup.py | 1 + 5 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 pywb/cdx/test/zipnum_test.py diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 8eff842c..fd0c14e9 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -214,7 +214,7 @@ def create_cdx_source(filename, config): if filename.endswith('.cdx'): return CDXFile(filename) - if filename.endswith('.summary'): + if filename.endswith(('.summary', '.idx')): return ZipNumCluster(filename, config) return None diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 384d7187..44483ca4 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -142,8 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test -#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') -#>>> pprint.pprint(x.next().items()) +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20020120142510'), ('original', 'http://example.com:80/'), @@ -172,6 +172,7 @@ def cdx_ops_test(url, sources = [test_cdx_dir 
+ 'iana.cdx'], **kwparams): results = server.load_cdx(**kwparams) for x in results: + x = x.replace('\t', ' ') sys.stdout.write(x) diff --git a/pywb/cdx/test/zipnum_test.py b/pywb/cdx/test/zipnum_test.py new file mode 100644 index 00000000..7c98309a --- /dev/null +++ b/pywb/cdx/test/zipnum_test.py @@ -0,0 +1,44 @@ +""" +>>> zip_ops_test(url = 'http://iana.org') +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz +org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + +# test idx index (tabs replacad with 4 spaces) +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) +org,iana)/dnssec 20140126201307 zipnum 8511 373 +org,iana)/domains/int 20140126201239 zipnum 8884 353 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 + +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 
657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +""" + + + + +from cdxserver_test import cdx_ops_test + +from pywb import get_test_dir +test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' +print test_zipnum + +def zip_ops_test(url, **kwargs): + sources = test_zipnum + cdx_ops_test(url, sources, **kwargs) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc index 249e1071..df4f3196 100644 --- a/sample_archive/zipcdx/zipnum-sample.loc +++ b/sample_archive/zipcdx/zipnum-sample.loc @@ -1 +1 @@ -zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz +zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz diff --git a/setup.py b/setup.py index 94c1bca7..307506fe 100755 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ setuptools.setup(name='pywb', provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), + ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], From 921b2eb2e1135cdf62aa9b770a7c50597ef35494 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Feb 2014 18:43:55 -0800 Subject: [PATCH 7/8] improve testing and a 
few fixes: archivalrouter: support empty collection, with and without SCRIPT_NAME cdx: remote cdx source test, including access denied replay: when content-length present, limit the decompressed stream to content-length (this ensures last 4 bytes in warc/arc record are not read) integration tests for identity replay --- pywb/archivalrouter.py | 6 +++-- pywb/cdx/cdxsource.py | 4 ++- pywb/cdx/perms.py | 2 +- pywb/cdx/test/cdxserver_test.py | 6 ++++- pywb/replay_views.py | 12 +++++++++ pywb/test/test_archivalrouter.py | 14 +++++++++++ pywb/utils/canonicalize.py | 13 +++++++--- tests/test_integration.py | 42 +++++++++++++++----------------- 8 files changed, 67 insertions(+), 32 deletions(-) diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 4d28b57e..5d3dc9f4 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -50,7 +50,10 @@ class Route: def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex - self.regex = re.compile(regex + lookahead) + if regex: + self.regex = re.compile(regex + lookahead) + else: + self.regex = re.compile('') self.handler = handler # collection id from regex group (default 0) self.coll_group = coll_group @@ -70,7 +73,6 @@ class Route: return None matched_str = matcher.group(0) - if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 783cf36b..ba5f8b3b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,6 +1,8 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader +from cdxobject import AccessException + import urllib import urllib2 import itertools @@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource): self.key_prefix = self.DEFAULT_KEY_PREFIX if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - + def
load_cdx(self, params): """ diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py index a7b90eb4..ad6ea00d 100644 --- a/pywb/cdx/perms.py +++ b/pywb/cdx/perms.py @@ -1,7 +1,7 @@ #================================================================= -class AllowAllPerms: +class AllowAllPerms(object): """ Sample Perm Checker which allows all """ diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 44483ca4..e5fac6b3 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('offset', '334'), ('filename', 'dupes.warc.gz')] -# NOTE: external dependency -- need self-contained test +# NOTE: external dependency -- need self-contained test TODO >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), @@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] + +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +Traceback (most recent call last): +AccessException: Blocked By Robots """ #================================================================= diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 9113ad5f..31e7af9a 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import LimitReader #================================================================= class ReplayView: @@ -54,10 +55,21 @@ class ReplayView: response = None + # if Content-Length for payload is present, ensure we don't read past it + content_len 
= status_headers.get_header('content-length') + try: + content_len=int(content_len) + if content_len > 0: + stream = LimitReader(stream, content_len) + except ValueError: + pass + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': response = self.rewrite_content(wbrequest, cdx, status_headers, stream) else: (status_headers, stream) = self.sanitize_content(status_headers, stream) + #status_headers.remove_header('content-length') + response_iter = self.stream_to_iter(stream) response = WbResponse(status_headers, response_iter) diff --git a/pywb/test/test_archivalrouter.py b/pywb/test/test_archivalrouter.py index 4379fbfd..229fafb6 100644 --- a/pywb/test/test_archivalrouter.py +++ b/pywb/test/test_archivalrouter.py @@ -15,6 +15,13 @@ 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} +# route with no collection +>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False)) +{'coll': '', + 'request_uri': 'http://example.com', + 'wb_prefix': '/pywb/', + 'wb_url': None} + # not matching route -- skipped >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) @@ -67,6 +74,13 @@ False >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') False +# With no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='') +'http://localhost:8080/2013/http://example.com/other.html' + +# With SCRIPT_NAME but no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='') +'http://localhost:8080/pywb-access/http://example.com/other.html' """ diff --git a/pywb/utils/canonicalize.py 
b/pywb/utils/canonicalize.py index bd21e4ca..73555ca6 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): >>> calc_search_range('http://example.com/path/file.html', 'host', False) ('example.com/', 'example.com0') - # domain range not supported + # errors: domain range not supported >>> calc_search_range('http://example.com/path/file.html', 'domain', False) Traceback (most recent call last): - Exception: matchType=domain unsupported for non-surt + UrlCanonicalizeException: matchType=domain unsupported for non-surt + + >>> calc_search_range('http://example.com/path/file.html', 'blah', False) + Traceback (most recent call last): + UrlCanonicalizeException: Invalid match_type: blah + """ def inc_last_char(x): return x[0:-1] + chr(ord(x[-1]) + 1) @@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise Exception('matchType=domain unsupported for non-surt') + raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') host = start_key.split(')/')[0] @@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): end_key = host + '-' else: - raise Exception('Invalid match_type: ' + match_type) + raise UrlCanonicalizeException('Invalid match_type: ' + match_type) return (start_key, end_key) diff --git a/tests/test_integration.py b/tests/test_integration.py index 1a7a943c..5a165041 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ import webtest from pywb.pywb_init import pywb_config from pywb.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.perms import AllowAllPerms class TestWb: TEST_CONFIG = 'test_config.yaml' @@ -73,7 +74,19 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'wb.js' in resp.body - assert 
'/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body + assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body + + def test_replay_identity_1(self): + resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') + #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg') + #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css') + #self._assert_basic_html(resp) + + # no wb header insertion + assert 'wb.js' not in resp.body + + # original unrewritten url present + assert '"http://www.iana.org/domains/example"' in resp.body def test_replay_content_length_1(self): # test larger file, rewritten file (svg!) @@ -198,38 +211,21 @@ class TestWb: # Reporter callback for replay view class PrintReporter: def __call__(self, wbrequest, cdx, response): - print wbrequest - print cdx + #print wbrequest + #print cdx pass #================================================================= -class TestExclusionPerms: +class TestExclusionPerms(AllowAllPerms): """ - Sample Perm Checker which allows all + Sample Perm Checker with hard-coded exclusion """ def allow_url_lookup(self, urlkey, url): """ Return true/false if url or urlkey (canonicalized url) should be allowed """ - print urlkey if urlkey == 'org,iana)/_img/bookmark_icon.ico': return False - return True - - def allow_capture(self, cdx): - """ - Return true/false is specified capture (cdx) should be - allowed - """ - return True - - def filter_fields(self, cdx): - """ - Filter out any forbidden cdx fields from cdx dictionary - """ - return cdx - - - + return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url) From 304a33aa5b188751e3f69e7930969cbb72d7cbc7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Feb 2014 18:52:41 -0800 Subject: [PATCH 8/8] add coverage badge --- README.md | 1 + tests/test_integration.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md 
b/README.md index 726d9709..83f1aa28 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ PyWb 0.2 Beta ============== [![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb) +[![Coverage Status](https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master)](https://coveralls.io/r/ikreymer/pywb?branch=master) pywb is a Python re-implementation of the Wayback Machine software. diff --git a/tests/test_integration.py b/tests/test_integration.py index 5a165041..5f6bb666 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -211,9 +211,8 @@ class TestWb: # Reporter callback for replay view class PrintReporter: def __call__(self, wbrequest, cdx, response): - #print wbrequest - #print cdx - pass + print wbrequest + print cdx #================================================================= class TestExclusionPerms(AllowAllPerms):