diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..d41f9d40 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,11 @@ +[run] +omit = + */test/* + */tests/* + *.html + *.js + *.css + +[report] +exclude_lines = + if __name__ == .__main__.: diff --git a/.travis.yml b/.travis.yml index 81d946f7..354f2c61 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,14 @@ python: - "2.7" # command to install dependencies install: - - "python setup.py -q install" + - python setup.py -q install + - pip install coverage pytest-cov coveralls --use-mirrors # command to run tests #script: nosetests --with-doctest #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py -script: py.test -v --doctest-module ./tests/*.py ./pywb/ +#script: py.test -v --doctest-module ./tests/*.py ./pywb/ +script: + py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/ + +after_success: + coveralls diff --git a/README.md b/README.md index 726d9709..83f1aa28 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ PyWb 0.2 Beta ============== [![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb) +[![Coverage Status](https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master)](https://coveralls.io/r/ikreymer/pywb?branch=master) pywb is a Python re-implementation of the Wayback Machine software. diff --git a/pywb/bootstrap/config_utils.py b/pywb/bootstrap/config_utils.py index 2307022a..686a6bbb 100644 --- a/pywb/bootstrap/config_utils.py +++ b/pywb/bootstrap/config_utils.py @@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = J2TemplateView): return file #================================================================= -def create_wb_handler(cdx_server, config): +def create_wb_handler(cdx_server, config, ds_rules_file=None): record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) paths = config.get('archive_paths') - resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) replayer = ReplayView( content_loader = resolving_loader, - content_rewriter = RewriteContent(), + content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), diff --git a/pywb/bootstrap/pywb_init.py b/pywb/bootstrap/pywb_init.py index 7465ba64..1fe33ddc 100644 --- a/pywb/bootstrap/pywb_init.py +++ b/pywb/bootstrap/pywb_init.py @@ -59,12 +59,13 @@ def pywb_config_manual(passed_config = {}): route_config = DictChain(value, config) - ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules) + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, ds_rules_file) wb_handler = config_utils.create_wb_handler( - cdx_server = cdx_server, - config = route_config, + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, ) logging.debug('Adding Collection: ' + name) diff --git a/pywb/bootstrap/wbapp.py b/pywb/bootstrap/wbapp.py index f9a6d359..e7ea0c82 100644 --- a/pywb/bootstrap/wbapp.py +++ b/pywb/bootstrap/wbapp.py @@ -2,6 +2,7 @@ from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedir from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders from pywb.cdx.cdxserver import CDXException +from pywb.utils.canonicalize import UrlCanonicalizeException from pywb.warc.recordloader import ArchiveLoadFailed import os @@ -55,7 +56,8 @@ def create_wb_app(wb_router): except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except (WbException, CDXException, ArchiveLoadFailed) as e: + except (WbException, CDXException, + UrlCanonicalizeException, ArchiveLoadFailed) as e: response = handle_exception(env, wb_router.error_view, e, False) except Exception as e: diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2c733c8d..556534a7 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -3,34 +3,45 @@ import re import logging import pkgutil -from canonicalize import unsurt, UrlCanonicalizer +from pywb.utils.dsrules import BaseRule, RuleSet + +from pywb.utils.canonicalize import unsurt, UrlCanonicalizer #================================================================= -def load_domain_specific_cdx_rules(filename, surt_ordered): - fh = pkgutil.get_data(__package__, filename) - config = yaml.load(fh) +def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): + """ + >>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True) + >>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d') + 'example,example,test)/path/index.html?id=value' + """ + canon = None + fuzzy = None # Load Canonicalizer Rules - rules = StartsWithRule.load_rules(config.get('canon_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', + ds_rules_file=ds_rules_file) + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: canon = CustomUrlCanonicalizer(rules, surt_ordered) - else: - canon = None # Load Fuzzy Lookup Rules - rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', + ds_rules_file=ds_rules_file) + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: fuzzy = FuzzyQuery(rules) - else: - fuzzy = None - logging.debug('CANON: ' + str(canon)) - logging.debug('FUZZY: ' + str(fuzzy)) + logging.debug('CustomCanonilizer? ' + str(bool(canon))) + logging.debug('FuzzyMatcher? ' + str(bool(canon))) return (canon, fuzzy) @@ -43,10 +54,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer): def __call__(self, url): urlkey = super(CustomUrlCanonicalizer, self).__call__(url) - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.match(urlkey) if not m: continue @@ -67,11 +75,10 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] + filter_ = params.get('filter', []) + output = params.get('output') - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) if not m: continue @@ -79,7 +86,7 @@ class FuzzyQuery: matched_rule = rule if len(m.groups()) == 1: - params['filter'] = '=urlkey:' + m.group(1) + filter_.append('~urlkey:' + m.group(1)) break @@ -88,38 +95,42 @@ class FuzzyQuery: inx = url.find('?') if inx > 0: - params['url'] = url[:inx + 1] + url = url[:inx + 1] + + params = {'url': url, + 'matchType': 'prefix', + 'filter': filter_, + 'output': output} - params['matchType'] = 'prefix' - params['key'] = None return params #================================================================= -class StartsWithRule: - def __init__(self, config, surt_ordered=True): - self.starts = config.get('startswith') - if not isinstance(self.starts, list): - self.starts = [self.starts] +class CDXDomainSpecificRule(BaseRule): + def __init__(self, name, config): + super(CDXDomainSpecificRule, self).__init__(name, config) - self.regex = re.compile(config.get('matches')) - self.replace = config.get('replace') + if isinstance(config, basestring): + self.regex = re.compile(config) + self.replace = None + else: + self.regex = re.compile(config.get('match')) + self.replace = config.get('replace') def unsurt(self): - # must convert to non-surt form - self.starts = map(unsurt, self.starts) - self.regex = unsurt(self.regex) - self.replace = unsurt(self.replace) + """ + urlkey is assumed to be in surt format by default + In the case of non-surt format, this method is called + to desurt any urls + """ + self.url_prefix = map(unsurt, self.url_prefix) + if self.regex: + self.regex = unsurt(self.regex) - @staticmethod - def load_rules(rules_config, surt_ordered=True): - if not rules_config: - return [] + if self.replace: + self.replace = unsurt(self.replace) - rules = map(StartsWithRule, rules_config) - if not surt_ordered: - for rule in rules: - rule.unsurt() - - return rules +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 247f3d18..1a90d7ca 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings): if self.invert: string = string[1:] - self.exact = string.startswith('=') - if self.exact: + # exact match + if string.startswith('='): string = string[1:] + self.compare_func = self.exact + elif string.startswith('~'): + string = string[1:] + self.compare_func = self.contains + else: + self.compare_func = self.regex parts = string.split(':', 1) # no field set, apply filter to entire cdx @@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings): self.field = parts[0] string = parts[1] - if self.exact: - self.exact_str = string - else: + # make regex if regex mode + if self.compare_func == self.regex: self.regex = re.compile(string) + else: + self.filter_str = string def __call__(self, cdx): val = cdx[self.field] if self.field else str(cdx) - if self.exact: - matched = (self.exact_str == val) - else: - matched = self.regex.match(val) is not None + + matched = self.compare_func(val) + return matched ^ self.invert + def exact(self, val): + return (self.filter_str == val) + + def contains(self, val): + return (self.filter_str in val) + + def regex(self, val): + return self.regex.match(val) is not None + filters = map(Filter, filter_strings) for cdx in cdx_iter: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1a68f7e4..fd0c14e9 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource @@ -17,13 +17,13 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - ds_rules = kwargs.get('ds_rules') + ds_rules_file = kwargs.get('ds_rules_file') surt_ordered = kwargs.get('surt_ordered', True) # load from domain-specific rules - if ds_rules: + if ds_rules_file: self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) # or custom passed in canonicalizer else: self.url_canon = kwargs.get('url_canon') @@ -50,14 +50,14 @@ class BaseCDXServer(object): url = params['url'] - if self.fuzzy_query and params.get('allowFuzzy'): - if not 'key' in params: - params['key'] = self.url_canon(url) + # check if fuzzy is allowed and ensure that its an + # exact match + if (self.fuzzy_query and params.get('allowFuzzy') and + params.get('matchType', 'exact') == 'exact'): - params = self.fuzzy_query(params) - if params: - params['allowFuzzy'] = False - return self.load_cdx(**params) + fuzzy_params = self.fuzzy_query(params) + if fuzzy_params: + return self.load_cdx(**fuzzy_params) msg = 'No Captures found for: ' + url raise CaptureNotFoundException(msg) @@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - #params['key'] = self.url_canon(url) match_type = params.get('matchType', 'exact') key, end_key = calc_search_range(url=url, @@ -166,7 +165,7 @@ def create_cdx_server(config, ds_rules_file=None): return server_cls(paths, config=pass_config, surt_ordered=surt_ordered, - ds_rules=ds_rules_file, + ds_rules_file=ds_rules_file, perms_checker=perms_checker) @@ -215,7 +214,7 @@ def create_cdx_source(filename, config): if filename.endswith('.cdx'): return CDXFile(filename) - if filename.endswith('.summary'): + if filename.endswith(('.summary', '.idx')): return ZipNumCluster(filename, config) return None diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 783cf36b..ba5f8b3b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,6 +1,8 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader +from cdxobject import AccessException + import urllib import urllib2 import itertools @@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource): self.key_prefix = self.DEFAULT_KEY_PREFIX if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - + def load_cdx(self, params): """ diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py index a7b90eb4..ad6ea00d 100644 --- a/pywb/cdx/perms.py +++ b/pywb/cdx/perms.py @@ -1,7 +1,7 @@ #================================================================= -class AllowAllPerms: +class AllowAllPerms(object): """ Sample Perm Checker which allows all """ diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml deleted file mode 100644 index 1da70582..00000000 --- a/pywb/cdx/rules.yaml +++ /dev/null @@ -1,24 +0,0 @@ - -fuzzy_lookup_rules: - - startswith: 'com,twitter)/i/profiles/show/' - matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' - - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' - - - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] - matches: '([^/]+(?:\.css|\.js))' - - # matches all urls - - startswith: '' - matches: '[&?](?:_|uncache)=[\d]+[&]?' - -canon_rules: - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - - - - - diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 0e799ce9..e5fac6b3 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz +# Filter contains +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter contains invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + # Collapse by timestamp # unresolved revisits, different statuscode results in an extra repeat >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) @@ -131,9 +141,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('offset', '334'), ('filename', 'dupes.warc.gz')] -# NOTE: external dependency -- need self-contained test -#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') -#>>> pprint.pprint(x.next().items()) +# NOTE: external dependency -- need self-contained test TODO +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20020120142510'), ('original', 'http://example.com:80/'), @@ -142,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] + +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +Traceback (most recent call last): +AccessException: Blocked By Robots """ #================================================================= @@ -162,6 +176,7 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): results = server.load_cdx(**kwparams) for x in results: + x = x.replace('\t', ' ') sys.stdout.write(x) diff --git a/pywb/cdx/test/zipnum_test.py b/pywb/cdx/test/zipnum_test.py new file mode 100644 index 00000000..7c98309a --- /dev/null +++ b/pywb/cdx/test/zipnum_test.py @@ -0,0 +1,44 @@ +""" +>>> zip_ops_test(url = 'http://iana.org') +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz +org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + +# test idx index (tabs replacad with 4 spaces) +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) +org,iana)/dnssec 20140126201307 zipnum 8511 373 +org,iana)/domains/int 20140126201239 zipnum 8884 353 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 + +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +""" + + + + +from cdxserver_test import cdx_ops_test + +from pywb import get_test_dir +test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' +print test_zipnum + +def zip_ops_test(url, **kwargs): + sources = test_zipnum + cdx_ops_test(url, sources, **kwargs) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index dd11ed4c..bf046416 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -7,6 +7,8 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import LimitReader + #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -53,10 +55,21 @@ class ReplayView: response = None + # if Content-Length for payload is present, ensure we don't read past it + content_len = status_headers.get_header('content-length') + try: + content_len=int(content_len) + if content_len > 0: + stream = LimitReader(stream, content_len) + except ValueError: + pass + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': response = self.rewrite_content(wbrequest, cdx, status_headers, stream) else: (status_headers, stream) = self.sanitize_content(status_headers, stream) + #status_headers.remove_header('content-length') + response_iter = self.stream_to_iter(stream) response = WbResponse(status_headers, response_iter) @@ -99,20 +112,34 @@ class ReplayView: def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) + result = self.content_rewriter.rewrite_headers(urlrewriter, + status_headers, + stream, + cdx['urlkey']) + (rewritten_headers, stream) = result # no rewriting needed! if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) return WbResponse(rewritten_headers.status_headers, response_iter) - # do head insert + def make_head_insert(rule): + return (self.head_insert_view.render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) + # do head insert if self.head_insert_view: - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) + head_insert_func = make_head_insert else: - head_insert_str = None + head_insert_func = None - (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) + result = self.content_rewriter.rewrite_content(urlrewriter, + rewritten_headers, + stream, + head_insert_func, + cdx['urlkey']) + + (status_headers, response_gen) = result if self.buffer_response: if wbrequest.wb_url.mod == 'id_': diff --git a/pywb/dispatch/archivalrouter.py b/pywb/dispatch/archivalrouter.py index f548969b..fb09fa1a 100644 --- a/pywb/dispatch/archivalrouter.py +++ b/pywb/dispatch/archivalrouter.py @@ -50,7 +50,10 @@ class Route: def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex - self.regex = re.compile(regex + lookahead) + if regex: + self.regex = re.compile(regex + lookahead) + else: + self.regex = re.compile('') self.handler = handler # collection id from regex group (default 0) self.coll_group = coll_group @@ -70,7 +73,6 @@ class Route: return None matched_str = matcher.group(0) - if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri diff --git a/pywb/dispatch/test/test_archivalrouter.py b/pywb/dispatch/test/test_archivalrouter.py index 82b0d147..a076c015 100644 --- a/pywb/dispatch/test/test_archivalrouter.py +++ b/pywb/dispatch/test/test_archivalrouter.py @@ -15,6 +15,13 @@ 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} +# route with no collection +>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False)) +{'coll': '', + 'request_uri': 'http://example.com', + 'wb_prefix': '/pywb/', + 'wb_url': None} + # not matching route -- skipped >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) @@ -67,6 +74,13 @@ False >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') False +# With no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='') +'http://localhost:8080/2013/http://example.com/other.html' + +# With SCRIPT_NAME but no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='') +'http://localhost:8080/pywb-access/http://example.com/other.html' """ diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 690775e7..a435b104 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -4,11 +4,16 @@ import itertools from url_rewriter import UrlRewriter + #================================================================= class RegexRewriter(object): + #@staticmethod + #def comment_out(string): + # return '/*' + string + '*/' + @staticmethod - def comment_out(string): - return '/*' + string + '*/' + def format(template): + return lambda string: template.format(string) @staticmethod def remove_https(string): @@ -20,19 +25,16 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda x: rewriter.rewrite(x) + return lambda string: rewriter.rewrite(string) - @staticmethod - def replacer(string): - return lambda x: string + #@staticmethod + #def replacer(other): + # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - DEFAULT_OP = add_prefix - def __init__(self, rules): #rules = self.create_rules(http_prefix) @@ -76,52 +78,68 @@ class RegexRewriter(object): op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) + final_str = result # if extracting partial match if i != full_m: - result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] + final_str = m.string[m.start(full_m):m.start(i)] + final_str += result + final_str += m.string[m.end(i):m.end(full_m)] + return final_str + + @staticmethod + def parse_rules_from_config(config): + def parse_rule(obj): + match = obj.get('match') + replace = RegexRewriter.format(obj.get('replace', '{0}')) + group = obj.get('group', 0) + result = (match, replace, group) return result - + return map(parse_rule, config) #================================================================= -class JSLinkRewriter(RegexRewriter): +class JSLinkOnlyRewriter(RegexRewriter): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' - def __init__(self, rewriter, rules = []): + def __init__(self, rewriter, rules=[]): rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] - super(JSLinkRewriter, self).__init__(rules) + super(JSLinkOnlyRewriter, self).__init__(rules) + #================================================================= -class JSLocationAndLinkRewriter(JSLinkRewriter): +class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): """ JS Rewriter which also rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ - def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'): + def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ (r'(?>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 6d66ce60..36e74848 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,11 +1,50 @@ from pywb.rewrite.rewrite_live import get_rewritten from pywb.rewrite.url_rewriter import UrlRewriter +from pywb import get_test_dir + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def head_insert_func(rule): + if rule.js_rewrite_location == True: + return '' + else: + return '' + + +def test_local_1(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'com,example,test)/', + head_insert_func) + + # wombat insert added + assert '' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + + +def test_local_2_no_js_location_rewrite(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'example,example,test)/nolocation_rewrite', + head_insert_func) + + # no wombat insert + assert '' not in buff + + # no location rewrite + assert 'window.location = "/other.html"' in buff + + # still link rewrite + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter) @@ -24,9 +63,10 @@ def test_example_2(): -#def test_example_3(): -# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) +def test_example_domain_specific_3(): + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) -# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff + # comment out bootloader + assert '/* Bootloader.configurePage' in buff diff --git a/pywb/rules.yaml b/pywb/rules.yaml new file mode 100644 index 00000000..ce9c7d81 --- /dev/null +++ b/pywb/rules.yaml @@ -0,0 +1,54 @@ + +rules: + + # twitter rules + #================================================================= + - url_prefix: 'com,twitter)/i/profiles/show/' + + fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' + + + # facebook rules + #================================================================= + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' + +# not actually needed, fuzzy match is used instead here +# canonicalize: +# match: 'com,facebook\)/.*[?&]data=([^&]+).*' +# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' + + + - url_prefix: 'com,facebook)/' + rewrite: + js_regexs: + - match: 'Bootloader\.configurePage.*' + replace: '/* {0} */' + + + # yahoo rules + #================================================================= + - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] + + fuzzy_lookup: '([^/]+(?:\.css|\.js))' + + + # testing rules -- not for valid domain + #================================================================= + # this rule block is a non-existent prefix merely for testing + - url_prefix: 'example,example,test)/' + + canonicalize: + match: '(example,example,test\)/.*?)[?].*?(id=value).*' + replace: '\1?\2' + + rewrite: + js_rewrite_location: False + + + # all domain rules -- fallback to this dataset + #================================================================= + # Applies to all urls -- should be last + - url_prefix: '' + fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' diff --git a/pywb/static/wb.js b/pywb/static/wb.js index a7b39370..c4798da8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -1,18 +1,21 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. +This file is part of pywb. -// Rewritten location and domain obj setup -window.WB_wombat_location = window.location + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -if (window.top != window) { - window.top.WB_wombat_location = window.top.location -} - -if (window.opener) { - window.opener.WB_wombat_location = window.opener.location -} - -document.WB_wombat_domain = document.domain + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with pywb. If not, see . +*/ function initBanner() { diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js new file mode 100644 index 00000000..d2b7d12c --- /dev/null +++ b/pywb/static/wombat.js @@ -0,0 +1,219 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. + +This file is part of pywb. + + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with pywb. If not, see . +*/ + +//============================================ +// Wombat JS-Rewriting Library +//============================================ + +var WB_wombat_replayPrefix; +var WB_wombat_replayDatePrefix; +var WB_wombat_captureDatePart; +var WB_wombat_origHost; + + +function WB_StripPort(str) +{ + var hostWithPort = str.match(/^http:\/\/[\w\d@.-]+:\d+/); + if (hostWithPort) { + var hostName = hostWithPort[0].substr(0, hostWithPort[0].lastIndexOf(':')); + return hostName + str.substr(hostWithPort[0].length); + } + + return str; +} + +function WB_IsHostUrl(str) +{ + // Good guess that's its a hostname + if (str.indexOf("www.") == 0) { + return true; + } + + // hostname:port (port required) + var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + // ip:port + matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + return false; +} + +function WB_RewriteUrl(url) +{ + var httpPrefix = "http://"; + + // If not dealing with a string, just return it + if (!url || (typeof url) != "string") { + return url; + } + + // If starts with prefix, no rewriting needed + // Only check replay prefix (no date) as date may be different for each capture + if (url.indexOf(WB_wombat_replayPrefix) == 0) { + return url; + } + + // If server relative url, add prefix and original host + if (url.charAt(0) == "/") { + + // Already a relative url, don't make any changes! + if (url.indexOf(WB_wombat_captureDatePart) >= 0) { + return url; + } + + return WB_wombat_replayDatePrefix + WB_wombat_origHost + url; + } + + // If full url starting with http://, add prefix + if (url.indexOf(httpPrefix) == 0) { + return WB_wombat_replayDatePrefix + url; + } + + // May or may not be a hostname, call function to determine + // If it is, add the prefix and make sure port is removed + if (WB_IsHostUrl(url)) { + return WB_wombat_replayDatePrefix + httpPrefix + url; + } + + return url; +} + +function WB_CopyObjectFields(obj) +{ + var newObj = {}; + + for (prop in obj) { + if ((typeof obj[prop]) != "function") { + newObj[prop] = obj[prop]; + } + } + + return newObj; +} + +function WB_ExtractOrig(href) +{ + if (!href) { + return ""; + } + href = href.toString(); + var index = href.indexOf("/http", 1); + if (index > 0) { + return href.substr(index + 1); + } else { + return href; + } +} + +function WB_CopyLocationObj(loc) +{ + var newLoc = WB_CopyObjectFields(loc); + + newLoc._origLoc = loc; + newLoc._origHref = loc.href; + + // Rewrite replace and assign functions + newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); } + newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); } + newLoc.reload = loc.reload; + newLoc.href = WB_ExtractOrig(newLoc._origHref); + newLoc.toString = function() { return this.href; } + + return newLoc; +} + +function WB_wombat_updateLoc(reqHref, origHref, location) +{ + if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) { + var finalHref = WB_RewriteUrl(reqHref); + + location.href = finalHref; + } +} + +function WB_wombat_checkLocationChange(wbLoc, isTop) +{ + var locType = (typeof wbLoc); + + var location = (isTop ? window.top.location : window.location); + + // String has been assigned to location, so assign it + if (locType == "string") { + WB_wombat_updateLoc(wbLoc, location.href, location) + + } else if (locType == "object") { + WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location); + } +} + +var wombat_updating = false; + +function WB_wombat_checkLocations() +{ + if (wombat_updating) { + return false; + } + + wombat_updating = true; + + WB_wombat_checkLocationChange(window.WB_wombat_location, false); + + if (window.self.location != window.top.location) { + WB_wombat_checkLocationChange(window.top.WB_wombat_location, true); + } + + wombat_updating = false; +} + +function WB_wombat_Init(replayPrefix, captureDate, origHost) +{ + WB_wombat_replayPrefix = replayPrefix; + WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/"; + WB_wombat_captureDatePart = "/" + captureDate + "/"; + + WB_wombat_origHost = "http://" + origHost; + + window.WB_wombat_location = WB_CopyLocationObj(window.self.location); + + + if (window.self.location != window.top.location) { + window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location); + } + + if (window.opener) { + window.opener.WB_wombat_location = (window.opener ? WB_CopyLocationObj(window.opener.location) : null); + } + + + document.WB_wombat_domain = origHost; + +} + +// Check quickly after page load +setTimeout(WB_wombat_checkLocations, 100); + + +// Check periodically every few seconds +setInterval(WB_wombat_checkLocations, 500); diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index b30cd015..aa910442 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,7 +1,14 @@ +{% if rule.js_rewrite_location %} + +{% endif %} + + diff --git a/pywb/cdx/canonicalize.py b/pywb/utils/canonicalize.py similarity index 87% rename from pywb/cdx/canonicalize.py rename to pywb/utils/canonicalize.py index e2f818b9..73555ca6 100644 --- a/pywb/cdx/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -3,8 +3,6 @@ import surt import urlparse -from cdxobject import CDXException - #================================================================= class UrlCanonicalizer(object): @@ -15,6 +13,12 @@ class UrlCanonicalizer(object): return canonicalize(url, self.surt_ordered) +#================================================================= +class UrlCanonicalizeException(Exception): + def status(self): + return '400 Bad Request' + + #================================================================= def canonicalize(url, surt_ordered=True): """ @@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True): try: key = surt.surt(url) except Exception as e: - raise CDXException('Invalid Url: ' + url) + raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url if not surt_ordered: @@ -114,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): >>> calc_search_range('http://example.com/path/file.html', 'host', False) ('example.com/', 'example.com0') - # domain range not supported + # errors: domain range not supported >>> calc_search_range('http://example.com/path/file.html', 'domain', False) Traceback (most recent call last): - Exception: matchType=domain unsupported for non-surt + UrlCanonicalizeException: matchType=domain unsupported for non-surt + + >>> calc_search_range('http://example.com/path/file.html', 'blah', False) + Traceback (most recent call last): + UrlCanonicalizeException: Invalid match_type: blah + """ def inc_last_char(x): return x[0:-1] + chr(ord(x[-1]) + 1) @@ -155,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise Exception('matchType=domain unsupported for non-surt') + raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') host = start_key.split(')/')[0] @@ -168,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): end_key = host + '-' else: - raise Exception('Invalid match_type: ' + match_type) + raise UrlCanonicalizeException('Invalid match_type: ' + match_type) return (start_key, end_key) diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py new file mode 100644 index 00000000..2e6f9626 --- /dev/null +++ b/pywb/utils/dsrules.py @@ -0,0 +1,98 @@ +import yaml +import pkgutil + +#================================================================= + +DEFAULT_RULES_FILE = 'rules.yaml' +DEFAULT_RULES_PKG = 'pywb' + + +#================================================================= +class RuleSet(object): + DEFAULT_KEY = '' + + def __init__(self, rule_cls, fieldname, **kwargs): + """ + A domain specific rules block, inited via config map. + If config map not specified, it is loaded from default location. + + The rules are represented as a map by domain. + Each rules configuration will load is own field type + from the list and given a specified rule_cls. + """ + + self.rules = [] + + ds_rules_file = kwargs.get('ds_rules_file') + default_rule_config = kwargs.get('default_rule_config') + + config = self.load_default_rules(ds_rules_file) + + rulesmap = config.get('rules') if config else None + + # if default_rule_config provided, always init a default ruleset + if not rulesmap and default_rule_config is not None: + self.rules = [rule_cls(self.DEFAULT_KEY, default_rule_config)] + return + + def_key_found = False + + # iterate over master rules file + for value in rulesmap: + url_prefix = value.get('url_prefix') + rules_def = value.get(fieldname) + if not rules_def: + continue + + if url_prefix == self.DEFAULT_KEY: + def_key_found = True + + self.rules.append(rule_cls(url_prefix, rules_def)) + + # if default_rule_config provided, always init a default ruleset + if not def_key_found and default_rule_config is not None: + self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config)) + + @staticmethod + def load_default_rules(filename=None, pkg=None): + config = None + + if not filename: + filename = DEFAULT_RULES_FILE + + if not pkg: + pkg = DEFAULT_RULES_PKG + + if filename: + yaml_str = pkgutil.get_data(pkg, filename) + config = yaml.load(yaml_str) + + return config + + def iter_matching(self, urlkey): + """ + Iterate over all matching rules for given urlkey + """ + for rule in self.rules: + if rule.applies(urlkey): + yield rule + + def get_first_match(self, urlkey): + for rule in self.rules: + if rule.applies(urlkey): + return rule + + +#================================================================= +class BaseRule(object): + """ + Base rule class -- subclassed to handle specific + rules for given url_prefix key + """ + def __init__(self, url_prefix, rules): + self.url_prefix = url_prefix + if not isinstance(self.url_prefix, list): + self.url_prefix = [self.url_prefix] + + def applies(self, urlkey): + return any(urlkey.startswith(x) for x in self.url_prefix) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index a117f539..7813ded8 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,6 +9,7 @@ import urllib2 import time +#================================================================= def is_http(filename): return any(filename.startswith(x) for x in ['http://', 'https://']) diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 7af3401f..f93f324d 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -162,6 +162,10 @@ def timestamp_to_datetime(string): >>> timestamp_to_datetime('40001965252477') datetime.datetime(2999, 12, 31, 23, 24, 59) + # not a number! + >>> timestamp_to_datetime('2010abc') + datetime.datetime(2010, 12, 31, 23, 59, 59) + """ # pad to 6 digits diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html new file mode 100644 index 00000000..c4f3ce35 --- /dev/null +++ b/sample_archive/text_content/sample.html @@ -0,0 +1,14 @@ + + +Sample Page For Rewrite Test + + + +Test Content +Some Link + diff --git a/sample_archive/zipcdx/zipnum-sample.cdx.gz b/sample_archive/zipcdx/zipnum-sample.cdx.gz new file mode 100644 index 00000000..8687b97a Binary files /dev/null and b/sample_archive/zipcdx/zipnum-sample.cdx.gz differ diff --git a/sample_archive/zipcdx/zipnum-sample.idx b/sample_archive/zipcdx/zipnum-sample.idx new file mode 100644 index 00000000..a70d8e87 --- /dev/null +++ b/sample_archive/zipcdx/zipnum-sample.idx @@ -0,0 +1,38 @@ +com,example)/ 20140127171200 zipnum 0 276 +org,iana)/ 20140127171238 zipnum 276 328 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 +org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 +org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 +org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 +org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 +org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 +org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 +org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 +org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 +org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 +org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 +org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 +org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 +org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 +org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 +org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 +org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 +org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 +org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 +org,iana)/dnssec 20140126201307 zipnum 8511 373 +org,iana)/domains/int 20140126201239 zipnum 8884 353 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 +org,iana)/time-zones 20140126200737 zipnum 9623 145 diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc new file mode 100644 index 00000000..df4f3196 --- /dev/null +++ b/sample_archive/zipcdx/zipnum-sample.loc @@ -0,0 +1 @@ +zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz diff --git a/setup.py b/setup.py index 982e067d..1fe72fa7 100755 --- a/setup.py +++ b/setup.py @@ -13,9 +13,11 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, + package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], + ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), + ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), + ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], # tests_require=['WebTest', 'pytest'], zip_safe=False) diff --git a/tests/test_integration.py b/tests/test_integration.py index bede0e2b..5af34e34 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ import webtest from pywb.bootstrap.pywb_init import pywb_config from pywb.bootstrap.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.perms import AllowAllPerms class TestWb: TEST_CONFIG = 'test_config.yaml' @@ -73,7 +74,19 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'wb.js' in resp.body - assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body + assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body + + def test_replay_identity_1(self): + resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') + #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg') + #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css') + #self._assert_basic_html(resp) + + # no wb header insertion + assert 'wb.js' not in resp.body + + # original unrewritten url present + assert '"http://www.iana.org/domains/example"' in resp.body def test_replay_content_length_1(self): # test larger file, rewritten file (svg!) @@ -200,36 +213,18 @@ class PrintReporter: def __call__(self, wbrequest, cdx, response): print wbrequest print cdx - pass #================================================================= -class TestExclusionPerms: +class TestExclusionPerms(AllowAllPerms): """ - Sample Perm Checker which allows all + Sample Perm Checker with hard-coded exclusion """ def allow_url_lookup(self, urlkey, url): """ Return true/false if url or urlkey (canonicalized url) should be allowed """ - print urlkey if urlkey == 'org,iana)/_img/bookmark_icon.ico': return False - return True - - def allow_capture(self, cdx): - """ - Return true/false is specified capture (cdx) should be - allowed - """ - return True - - def filter_fields(self, cdx): - """ - Filter out any forbidden cdx fields from cdx dictionary - """ - return cdx - - - + return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)