diff --git a/.coveragerc b/.coveragerc index 63400c07..d41f9d40 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,6 +2,9 @@ omit = */test/* */tests/* + *.html + *.js + *.css [report] exclude_lines = diff --git a/.travis.yml b/.travis.yml index bab78128..354f2c61 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,8 @@ python: - "2.7" # command to install dependencies install: - - "python setup.py -q install" - - "pip install python-coveralls" - - "pip install pytest-cov" + - python setup.py -q install + - pip install coverage pytest-cov coveralls --use-mirrors # command to run tests #script: nosetests --with-doctest #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py diff --git a/README.md b/README.md index 726d9709..83f1aa28 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ PyWb 0.2 Beta ============== [](https://travis-ci.org/ikreymer/pywb) +[](https://coveralls.io/r/ikreymer/pywb?branch=master) pywb is a Python re-implementation of the Wayback Machine software. diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 4d28b57e..5d3dc9f4 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -50,7 +50,10 @@ class Route: def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex - self.regex = re.compile(regex + lookahead) + if regex: + self.regex = re.compile(regex + lookahead) + else: + self.regex = re.compile('') self.handler = handler # collection id from regex group (default 0) self.coll_group = coll_group @@ -70,7 +73,6 @@ class Route: return None matched_str = matcher.group(0) - if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 4d8d9b87..882de0dc 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -3,34 +3,43 @@ import re import logging import pkg_resources -from canonicalize import unsurt, UrlCanonicalizer +from pywb.utils.dsrules import BaseRule, RuleSet + +from pywb.utils.canonicalize import unsurt, UrlCanonicalizer #================================================================= -def load_domain_specific_cdx_rules(filename, surt_ordered): - fh = pkg_resources.resource_string(__name__, filename) - config = yaml.load(fh) +def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): + #fh = pkg_resources.resource_string(__name__, filename) + #config = yaml.load(fh) + + canon = None + fuzzy = None # Load Canonicalizer Rules - rules = StartsWithRule.load_rules(config.get('canon_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', + ds_rules_file=ds_rules_file) + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: canon = CustomUrlCanonicalizer(rules, surt_ordered) - else: - canon = None # Load Fuzzy Lookup Rules - rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'), - surt_ordered) + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', + ds_rules_file=ds_rules_file) + + if not surt_ordered: + for rule in rules: + rule.unsurt() if rules: fuzzy = FuzzyQuery(rules) - else: - fuzzy = None - logging.debug('CANON: ' + str(canon)) - logging.debug('FUZZY: ' + str(fuzzy)) + logging.debug('CustomCanonilizer? ' + str(bool(canon))) + logging.debug('FuzzyMatcher? 
' + str(bool(canon))) return (canon, fuzzy) @@ -43,10 +52,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer): def __call__(self, url): urlkey = super(CustomUrlCanonicalizer, self).__call__(url) - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.match(urlkey) if not m: continue @@ -67,11 +73,10 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] + filter_ = params.get('filter', []) + output = params.get('output') - for rule in self.rules: - if not any(urlkey.startswith(x) for x in rule.starts): - continue - + for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) if not m: continue @@ -79,7 +84,7 @@ class FuzzyQuery: matched_rule = rule if len(m.groups()) == 1: - params['filter'] = '=urlkey:' + m.group(1) + filter_.append('~urlkey:' + m.group(1)) break @@ -88,28 +93,40 @@ class FuzzyQuery: inx = url.find('?') if inx > 0: - params['url'] = url[:inx + 1] + url = url[:inx + 1] + + params = {'url': url, + 'matchType': 'prefix', + 'filter': filter_, + 'output': output} - params['matchType'] = 'prefix' - params['key'] = None return params #================================================================= -class StartsWithRule: - def __init__(self, config, surt_ordered=True): - self.starts = config.get('startswith') - if not isinstance(self.starts, list): - self.starts = [self.starts] +class CDXDomainSpecificRule(BaseRule): + def __init__(self, name, config): + super(CDXDomainSpecificRule, self).__init__(name, config) - self.regex = re.compile(config.get('matches')) - self.replace = config.get('replace') + if isinstance(config, basestring): + self.regex = re.compile(config) + self.replace = None + else: + self.regex = re.compile(config.get('match')) + self.replace = config.get('replace') def unsurt(self): - # must convert to non-surt form - self.starts = map(unsurt, self.starts) - self.regex = unsurt(self.regex) - self.replace = unsurt(self.replace) + """ + urlkey is assumed to be in surt format by default + In the case of non-surt format, this method is called + to desurt any urls + """ + self.url_prefix = map(unsurt, self.url_prefix) + if self.regex: + self.regex = unsurt(self.regex) + + if self.replace: + self.replace = unsurt(self.replace) @staticmethod def load_rules(rules_config, surt_ordered=True): diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 2c2c30af..4bdb0a55 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -151,9 +151,15 @@ def cdx_filter(cdx_iter, filter_strings): if self.invert: string = string[1:] - self.exact = string.startswith('=') - if self.exact: + # exact match + if string.startswith('='): string = string[1:] + self.compare_func = self.exact + elif string.startswith('~'): + string = string[1:] + self.compare_func = self.contains + else: + self.compare_func = self.regex parts = string.split(':', 1) # no field set, apply filter to entire cdx @@ -164,19 +170,28 @@ def cdx_filter(cdx_iter, filter_strings): self.field = parts[0] string = parts[1] - if self.exact: - self.exact_str = string - else: + # make regex if regex mode + if self.compare_func == self.regex: self.regex = re.compile(string) + else: + self.filter_str = string def __call__(self, cdx): val = cdx[self.field] if self.field else str(cdx) - if self.exact: - matched = (self.exact_str == val) - else: - matched = self.regex.match(val) is not None + + matched = self.compare_func(val) + return matched ^ self.invert + def exact(self, val): + return 
(self.filter_str == val) + + def contains(self, val): + return (self.filter_str in val) + + def regex(self, val): + return self.regex.match(val) is not None + filters = map(Filter, filter_strings) for cdx in cdx_iter: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 83627009..0de183ae 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource @@ -17,13 +17,13 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - ds_rules = kwargs.get('ds_rules') + ds_rules_file = kwargs.get('ds_rules_file') surt_ordered = kwargs.get('surt_ordered', True) # load from domain-specific rules - if ds_rules: + if ds_rules_file: self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) # or custom passed in canonicalizer else: self.url_canon = kwargs.get('url_canon') @@ -50,14 +50,14 @@ class BaseCDXServer(object): url = params['url'] - if self.fuzzy_query and params.get('allowFuzzy'): - if not 'key' in params: - params['key'] = self.url_canon(url) + # check if fuzzy is allowed and ensure that its an + # exact match + if (self.fuzzy_query and params.get('allowFuzzy') and + params.get('matchType', 'exact') == 'exact'): - params = self.fuzzy_query(params) - if params: - params['allowFuzzy'] = False - return self.load_cdx(**params) + fuzzy_params = self.fuzzy_query(params) + if fuzzy_params: + return self.load_cdx(**fuzzy_params) msg = 'No Captures found for: ' + url raise CaptureNotFoundException(msg) @@ -98,7 +98,6 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - #params['key'] = self.url_canon(url) match_type = params.get('matchType', 'exact') key, end_key = calc_search_range(url=url, @@ -159,7 +158,7 @@ class CDXServer(BaseCDXServer): if filename.endswith('.cdx'): return CDXFile(filename) - if filename.endswith('.summary'): + if filename.endswith(('.summary', '.idx')): return ZipNumCluster(filename, config) logging.warn('skipping unrecognized URI:%s', filename) @@ -218,7 +217,7 @@ def create_cdx_server(config, ds_rules_file=None): return server_cls(paths, config=pass_config, surt_ordered=surt_ordered, - ds_rules=ds_rules_file, + ds_rules_file=ds_rules_file, perms_checker=perms_checker) #================================================================= diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 783cf36b..ba5f8b3b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,6 +1,8 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader +from cdxobject import AccessException + import urllib import urllib2 import itertools @@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource): self.key_prefix = self.DEFAULT_KEY_PREFIX if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - + def load_cdx(self, params): """ diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py index a7b90eb4..ad6ea00d 100644 --- a/pywb/cdx/perms.py +++ b/pywb/cdx/perms.py @@ -1,7 +1,7 @@ #================================================================= -class AllowAllPerms: +class AllowAllPerms(object): """ Sample Perm Checker 
which allows all """ diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml deleted file mode 100644 index 1da70582..00000000 --- a/pywb/cdx/rules.yaml +++ /dev/null @@ -1,24 +0,0 @@ - -fuzzy_lookup_rules: - - startswith: 'com,twitter)/i/profiles/show/' - matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' - - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' - - - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] - matches: '([^/]+(?:\.css|\.js))' - - # matches all urls - - startswith: '' - matches: '[&?](?:_|uncache)=[\d]+[&]?' - -canon_rules: - - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - matches: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - - - - - diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index ad9286bf..f09af0fc 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz +# Filter contains +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter contains invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + # Collapse by timestamp # unresolved revisits, different statuscode results in an extra repeat >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) @@ -131,9 +141,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('offset', '334'), ('filename', 'dupes.warc.gz')] -# NOTE: external dependency -- need self-contained test -#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') -#>>> pprint.pprint(x.next().items()) +# NOTE: external dependency -- need self-contained test TODO +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20020120142510'), ('original', 'http://example.com:80/'), @@ -142,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] + +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +Traceback (most recent call last): +AccessException: Blocked By Robots """ 
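
Note on the filter syntax introduced in cdxops.py above: a CDX filter expression is now `[!][=|~][field:]value`, where `!` inverts the match, `=` selects exact comparison, `~` selects a substring ("contains") comparison, and a bare value is treated as a regex; an optional `field:` prefix scopes the comparison to a single CDX field (the doctests above exercise `~urlkey:example=1` and `!~urlkey:example=1`). A minimal standalone sketch of that parsing, assuming a plain dict per CDX line rather than pywb's CDXObject, and using `make_filter` as an illustrative name rather than the actual `Filter` class:

```python
import re


def make_filter(filter_string):
    """Build a predicate over one CDX line (here a plain dict) from a
    filter expression of the form [!][=|~][field:]value, mirroring the
    Filter changes in cdxops.py above."""
    invert = filter_string.startswith('!')
    if invert:
        filter_string = filter_string[1:]

    if filter_string.startswith('='):      # exact match
        op, filter_string = 'exact', filter_string[1:]
    elif filter_string.startswith('~'):    # contains match
        op, filter_string = 'contains', filter_string[1:]
    else:                                  # default: regex match
        op = 'regex'

    # optional 'field:' prefix scopes the filter to a single cdx field
    field, sep, value = filter_string.partition(':')
    if not sep:
        field, value = None, filter_string

    regex = re.compile(value) if op == 'regex' else None

    def matches(cdx):
        target = cdx[field] if field else str(cdx)
        if op == 'exact':
            result = (value == target)
        elif op == 'contains':
            result = (value in target)
        else:
            result = regex.match(target) is not None
        return result ^ invert

    return matches


# keep only captures whose urlkey contains 'example=1', as in the doctest above
flt = make_filter('~urlkey:example=1')
assert flt({'urlkey': 'com,example)/?example=1'})
assert not flt({'urlkey': 'com,example)/'})
```
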
#================================================================= @@ -169,7 +183,8 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): results = server.load_cdx(**kwparams) for x in results: - sys.stdout.write(x.to_text(fields)) + l = x.to_text(fields).replace('\t', ' ') + sys.stdout.write(l) #================================================================ diff --git a/pywb/cdx/test/zipnum_test.py b/pywb/cdx/test/zipnum_test.py new file mode 100644 index 00000000..7c98309a --- /dev/null +++ b/pywb/cdx/test/zipnum_test.py @@ -0,0 +1,44 @@ +""" +>>> zip_ops_test(url = 'http://iana.org') +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz +org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + +# test idx index (tabs replacad with 4 spaces) +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) +org,iana)/dnssec 20140126201307 zipnum 8511 373 +org,iana)/domains/int 20140126201239 zipnum 8884 353 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 + +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +""" + + + + +from cdxserver_test import cdx_ops_test + +from pywb import get_test_dir +test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' +print test_zipnum + +def zip_ops_test(url, **kwargs): + sources = test_zipnum + cdx_ops_test(url, sources, **kwargs) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 672e8735..05844a2e 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView): return file #================================================================= -def create_wb_handler(cdx_server, config): +def create_wb_handler(cdx_server, config, ds_rules_file=None): record_loader = 
ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) paths = config.get('archive_paths') - resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) replayer = replay_views.ReplayView( content_loader = resolving_loader, - content_rewriter = RewriteContent(), + content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index be4bdded..bd63bfd5 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}): route_config = DictChain(value, config) - ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules) + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, ds_rules_file) wb_handler = config_utils.create_wb_handler( - cdx_server = cdx_server, - config = route_config, + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, ) logging.debug('Adding Collection: ' + name) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 4c6907eb..31e7af9a 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,8 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import LimitReader + #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -53,10 +55,21 @@ class ReplayView: response = None + # if Content-Length for payload is present, ensure we don't read past it + content_len = status_headers.get_header('content-length') + try: + content_len=int(content_len) + if content_len > 0: + stream = LimitReader(stream, content_len) + except ValueError: + pass + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': response = self.rewrite_content(wbrequest, cdx, status_headers, stream) else: (status_headers, stream) = self.sanitize_content(status_headers, stream) + #status_headers.remove_header('content-length') + response_iter = self.stream_to_iter(stream) response = WbResponse(status_headers, response_iter) @@ -99,20 +112,34 @@ class ReplayView: def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) + result = self.content_rewriter.rewrite_headers(urlrewriter, + status_headers, + stream, + cdx['urlkey']) + (rewritten_headers, stream) = result # no rewriting needed! 
if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) return WbResponse(rewritten_headers.status_headers, response_iter) - # do head insert + def make_head_insert(rule): + return (self.head_insert_view.render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) + # do head insert if self.head_insert_view: - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) + head_insert_func = make_head_insert else: - head_insert_str = None + head_insert_func = None - (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) + result = self.content_rewriter.rewrite_content(urlrewriter, + rewritten_headers, + stream, + head_insert_func, + cdx['urlkey']) + + (status_headers, response_gen) = result if self.buffer_response: if wbrequest.wb_url.mod == 'id_': diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 690775e7..a435b104 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -4,11 +4,16 @@ import itertools from url_rewriter import UrlRewriter + #================================================================= class RegexRewriter(object): + #@staticmethod + #def comment_out(string): + # return '/*' + string + '*/' + @staticmethod - def comment_out(string): - return '/*' + string + '*/' + def format(template): + return lambda string: template.format(string) @staticmethod def remove_https(string): @@ -20,19 +25,16 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda x: rewriter.rewrite(x) + return lambda string: rewriter.rewrite(string) - @staticmethod - def replacer(string): - return lambda x: string + #@staticmethod + #def replacer(other): + # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - DEFAULT_OP = add_prefix - def __init__(self, rules): #rules = self.create_rules(http_prefix) @@ -76,52 +78,68 @@ class RegexRewriter(object): op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) + final_str = result # if extracting partial match if i != full_m: - result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)] + final_str = m.string[m.start(full_m):m.start(i)] + final_str += result + final_str += m.string[m.end(i):m.end(full_m)] + return final_str + + @staticmethod + def parse_rules_from_config(config): + def parse_rule(obj): + match = obj.get('match') + replace = RegexRewriter.format(obj.get('replace', '{0}')) + group = obj.get('group', 0) + result = (match, replace, group) return result - + return map(parse_rule, config) #================================================================= -class JSLinkRewriter(RegexRewriter): +class JSLinkOnlyRewriter(RegexRewriter): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' - def __init__(self, rewriter, rules = []): + def __init__(self, rewriter, rules=[]): rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] - super(JSLinkRewriter, self).__init__(rules) + super(JSLinkOnlyRewriter, self).__init__(rules) + #================================================================= -class JSLocationAndLinkRewriter(JSLinkRewriter): +class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): """ JS Rewriter which also rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ - def __init__(self, rewriter, rules = 
[], prefix = 'WB_wombat_'): + def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ (r'(?>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)]) +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 6d66ce60..36e74848 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,11 +1,50 @@ from pywb.rewrite.rewrite_live import get_rewritten from pywb.rewrite.url_rewriter import UrlRewriter +from pywb import get_test_dir + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def head_insert_func(rule): + if rule.js_rewrite_location == True: + return '' + else: + return '' + + +def test_local_1(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'com,example,test)/', + head_insert_func) + + # wombat insert added + assert '
' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + + +def test_local_2_no_js_location_rewrite(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'example,example,test)/nolocation_rewrite', + head_insert_func) + + # no wombat insert + assert '' not in buff + + # no location rewrite + assert 'window.location = "/other.html"' in buff + + # still link rewrite + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter) @@ -24,9 +63,10 @@ def test_example_2(): -#def test_example_3(): -# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) +def test_example_domain_specific_3(): + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) -# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff + # comment out bootloader + assert '/* Bootloader.configurePage' in buff diff --git a/pywb/rules.yaml b/pywb/rules.yaml new file mode 100644 index 00000000..8927d2f1 --- /dev/null +++ b/pywb/rules.yaml @@ -0,0 +1,50 @@ + +rules: + + # twitter rules + #================================================================= + - url_prefix: 'com,twitter)/i/profiles/show/' + + fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)' + + + # facebook rules + #================================================================= + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' + +# not actually needed, fuzzy match is used instead here +# canonicalize: +# match: 'com,facebook\)/.*[?&]data=([^&]+).*' +# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' + + + - url_prefix: 'com,facebook)/' + rewrite: + js_regexs: + - match: 'Bootloader\.configurePage.*' + replace: '/* {0} */' + + + # yahoo rules + #================================================================= + - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo'] + + fuzzy_lookup: '([^/]+(?:\.css|\.js))' + + + # testing rules -- not for valid domain + #================================================================= + # this rule block is a non-existent prefix merely for testing + - url_prefix: 'example,example,test)/nolocation_rewrite' + + rewrite: + js_rewrite_location: False + + + # all domain rules -- fallback to this dataset + #================================================================= + # Applies to all urls -- should be last + - url_prefix: '' + fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' diff --git a/pywb/static/wb.js b/pywb/static/wb.js index a7b39370..c4798da8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -1,18 +1,21 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. +This file is part of pywb. -// Rewritten location and domain obj setup -window.WB_wombat_location = window.location + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
-if (window.top != window) { - window.top.WB_wombat_location = window.top.location -} - -if (window.opener) { - window.opener.WB_wombat_location = window.opener.location -} - -document.WB_wombat_domain = document.domain + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with pywb. If not, see
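
The rules.yaml entries above feed the RuleSet/fuzzy-lookup changes in cdxdomainspecific.py: when an exact CDX lookup finds nothing, the rule whose `url_prefix` matches the SURT urlkey supplies a `fuzzy_lookup` regex, and the query is retried as a prefix match constrained by a contains (`~urlkey:`) filter. A rough self-contained sketch of that flow, with the rule data inlined rather than loaded through pywb's RuleSet, and `fuzzy_query_params` as a hypothetical helper standing in for `FuzzyQuery.__call__`:

```python
import re

# two entries in the shape of pywb/rules.yaml above, inlined for the example
RULES = [
    {'url_prefix': 'com,twitter)/i/profiles/show/',
     'fuzzy_lookup': r'/profiles/show/.*with_replies\?.*(max_id=[^&]+)'},
    {'url_prefix': '',   # catch-all rule, kept last
     'fuzzy_lookup': r'[&?](?:_|uncache)=[\d]+[&]?'},
]


def fuzzy_query_params(urlkey, url):
    """Hypothetical helper standing in for FuzzyQuery.__call__: rebuild a
    failed exact lookup as a prefix query plus a contains filter."""
    for rule in RULES:
        if not urlkey.startswith(rule['url_prefix']):
            continue
        m = re.search(rule['fuzzy_lookup'], urlkey)
        if not m:
            continue

        filters = []
        if len(m.groups()) == 1:
            # keep the significant query arg (e.g. max_id=...) as a filter
            filters.append('~urlkey:' + m.group(1))

        # query everything under the url up to and including the '?'
        inx = url.find('?')
        prefix_url = url[:inx + 1] if inx > 0 else url

        return {'url': prefix_url, 'matchType': 'prefix', 'filter': filters}

    return None


params = fuzzy_query_params(
    'com,twitter)/i/profiles/show/someuser/with_replies?lang=en&max_id=12345',
    'https://twitter.com/i/profiles/show/someuser/with_replies?lang=en&max_id=12345')

assert params == {'url': 'https://twitter.com/i/profiles/show/someuser/with_replies?',
                  'matchType': 'prefix',
                  'filter': ['~urlkey:max_id=12345']}
```
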
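
Similarly, a per-prefix `rewrite` block in rules.yaml can carry `js_regexs` entries, such as the Facebook rule above that comments out `Bootloader.configurePage` calls via the `/* {0} */` template handled by `RegexRewriter.format`. A small illustrative sketch of that substitution, assuming plain dict rule entries and using hypothetical helper names; it loops per rule for clarity rather than applying the rules inside pywb's RegexRewriter machinery shown above:

```python
import re


def make_format_op(template):
    # mirrors RegexRewriter.format in the diff: '{0}' in the template is
    # replaced by the matched text, so '/* {0} */' comments a match out
    return lambda string: template.format(string)


def apply_js_regexs(js_regexs, text):
    """Apply rules.yaml-style js_regexs entries to a block of JS.
    Illustrative only -- a hypothetical stand-in, not a pywb API."""
    for entry in js_regexs:
        op = make_format_op(entry.get('replace', '{0}'))
        text = re.sub(entry['match'], lambda m, op=op: op(m.group(0)), text)
    return text


# the facebook rewrite rule from pywb/rules.yaml above
fb_js_regexs = [{'match': r'Bootloader\.configurePage.*',
                 'replace': '/* {0} */'}]

js = 'Bootloader.configurePage({"id": 1});'
print(apply_js_regexs(fb_js_regexs, js))
# -> /* Bootloader.configurePage({"id": 1}); */
```

This matches the behavior asserted in test_rewrite_live.py above, where the rewritten Facebook page is expected to contain `/* Bootloader.configurePage`.
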