From 349a1a7a3a529136de3b8df798154437c4337456 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Tue, 25 Feb 2014 15:30:16 -0800
Subject: [PATCH 1/8] add doctest case to timeutils.py, tweak .travis.yml

---
 .travis.yml             | 5 ++---
 pywb/utils/timeutils.py | 4 ++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index bab78128..354f2c61 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,9 +3,8 @@ python:
   - "2.7"
 # command to install dependencies
 install:
-  - "python setup.py -q install"
-  - "pip install python-coveralls"
-  - "pip install pytest-cov"
+  - python setup.py -q install
+  - pip install coverage pytest-cov coveralls
 # command to run tests
 #script: nosetests --with-doctest
 #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py
index 7af3401f..f93f324d 100644
--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@@ -162,6 +162,10 @@ def timestamp_to_datetime(string):
     >>> timestamp_to_datetime('40001965252477')
     datetime.datetime(2999, 12, 31, 23, 24, 59)
 
+    # not a number!
+    >>> timestamp_to_datetime('2010abc')
+    datetime.datetime(2010, 12, 31, 23, 59, 59)
+
     """
 
     # pad to 6 digits

From 5a41f59f39807575dcd1b4ec5f5e78235bf0a3cc Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Wed, 26 Feb 2014 18:02:01 -0800
Subject: [PATCH 2/8] new unified config system via rules.yaml: contains
 configs for cdx canonicalization, fuzzy matching and rewriting. rewriting:
 ability to add custom regexes per domain, plus ability to toggle js location
 rewriting and to set a custom rewriting file (default is wombat.js)

---
 .coveragerc                             |  3 +
 pywb/cdx/cdxdomainspecific.py           | 68 ++++++++++++---------
 pywb/cdx/rules.yaml                     | 24 --------
 pywb/rewrite/regex_rewriters.py         | 80 ++++++++++++++++---------
 pywb/rewrite/rewrite_content.py         | 53 +++++++++-------
 pywb/rewrite/rewrite_live.py            | 34 ++++++++++-
 pywb/rewrite/test/test_rewrite.py       |  2 +-
 pywb/rewrite/test/test_rewrite_live.py  | 34 ++++++++++-
 pywb/rules.yaml                         | 49 +++++++++++++++
 pywb/utils/loaders.py                   |  1 +
 sample_archive/text_content/sample.html | 14 +++++
 setup.py                                |  3 +-
 12 files changed, 253 insertions(+), 112 deletions(-)
 delete mode 100644 pywb/cdx/rules.yaml
 create mode 100644 pywb/rules.yaml
 create mode 100644 sample_archive/text_content/sample.html

diff --git a/.coveragerc b/.coveragerc
index 63400c07..d41f9d40 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,6 +2,9 @@
 omit = 
     */test/*
     */tests/*
+    *.html
+    *.js
+    *.css
 
 [report]
 exclude_lines =
diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py
index 2c733c8d..a9e06778 100644
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@@ -3,31 +3,38 @@ import re
 import logging
 import pkgutil
 
+from pywb.utils.dsrules import BaseRule, RuleSet
+
 from canonicalize import unsurt, UrlCanonicalizer
 
 
 #=================================================================
 def load_domain_specific_cdx_rules(filename, surt_ordered):
-    fh = pkgutil.get_data(__package__, filename)
-    config = yaml.load(fh)
+    #fh = pkgutil.get_data(__package__, filename)
+    #config = yaml.load(fh)
+
+    canon = None
+    fuzzy = None
 
     # Load Canonicalizer Rules
-    rules = StartsWithRule.load_rules(config.get('canon_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize')
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()
 
     if rules:
         canon = CustomUrlCanonicalizer(rules, surt_ordered)
-    else:
-        canon = None
 
     # Load Fuzzy Lookup Rules
-    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup')
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()
 
     if rules:
         fuzzy = FuzzyQuery(rules)
-    else:
-        fuzzy = None
 
     logging.debug('CANON: ' + str(canon))
     logging.debug('FUZZY: ' + str(fuzzy))
@@ -43,10 +50,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
     def __call__(self, url):
         urlkey = super(CustomUrlCanonicalizer, self).__call__(url)
 
-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
             m = rule.regex.match(urlkey)
             if not m:
                 continue
@@ -68,10 +72,7 @@ class FuzzyQuery:
         urlkey = params['key']
         url = params['url']
 
-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
             m = rule.regex.search(urlkey)
             if not m:
                 continue
@@ -96,20 +97,29 @@ class FuzzyQuery:
 
 
 #=================================================================
-class StartsWithRule:
-    def __init__(self, config, surt_ordered=True):
-        self.starts = config.get('startswith')
-        if not isinstance(self.starts, list):
-            self.starts = [self.starts]
+class CDXDomainSpecificRule(BaseRule):
+    def __init__(self, name, config):
+        super(CDXDomainSpecificRule, self).__init__(name, config)
 
-        self.regex = re.compile(config.get('matches'))
-        self.replace = config.get('replace')
+        if isinstance(config, basestring):
+            self.regex = re.compile(config)
+            self.replace = None
+        else:
+            self.regex = re.compile(config.get('match'))
+            self.replace = config.get('replace')
 
     def unsurt(self):
-        # must convert to non-surt form
-        self.starts = map(unsurt, self.starts)
-        self.regex = unsurt(self.regex)
-        self.replace = unsurt(self.replace)
+        """
+        urlkey is assumed to be in surt format by default.
+        For non-surt keys, desurt the url prefixes and the replacement,
+        and recompile the regex from its desurted pattern source.
+        """
+        self.url_prefix = map(unsurt, self.url_prefix)
+        if self.regex:
+            self.regex = re.compile(unsurt(self.regex.pattern))
+
+        if self.replace:
+            self.replace = unsurt(self.replace)
 
     @staticmethod
     def load_rules(rules_config, surt_ordered=True):
diff --git a/pywb/cdx/rules.yaml b/pywb/cdx/rules.yaml
deleted file mode 100644
index 1da70582..00000000
--- a/pywb/cdx/rules.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-
-fuzzy_lookup_rules:
-    - startswith: 'com,twitter)/i/profiles/show/'
-      matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
-
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
-
-    - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
-      matches: '([^/]+(?:\.css|\.js))'
-
-    # matches all urls
-    - startswith: ''
-      matches: '[&?](?:_|uncache)=[\d]+[&]?'
-
-canon_rules:
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
-      replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
-
-
-
-
-
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
index 690775e7..a435b104 100644
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@@ -4,11 +4,16 @@ import itertools
 
 from url_rewriter import UrlRewriter
 
+
 #=================================================================
 class RegexRewriter(object):
+    #@staticmethod
+    #def comment_out(string):
+    #    return '/*' + string + '*/'
+
     @staticmethod
-    def comment_out(string):
-        return '/*' + string + '*/'
+    def format(template):
+        return lambda string: template.format(string)
 
     @staticmethod
     def remove_https(string):
@@ -20,19 +25,16 @@ class RegexRewriter(object):
 
     @staticmethod
     def archival_rewrite(rewriter):
-        return lambda x: rewriter.rewrite(x)
+        return lambda string: rewriter.rewrite(string)
 
-    @staticmethod
-    def replacer(string):
-        return lambda x: string
+    #@staticmethod
+    #def replacer(other):
+    #    return lambda m, string: other
 
     HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
 
-
-
     DEFAULT_OP = add_prefix
 
-
     def __init__(self, rules):
         #rules = self.create_rules(http_prefix)
 
@@ -76,52 +78,68 @@ class RegexRewriter(object):
                 op = RegexRewriter.DEFAULT_OP(op)
 
             result = op(m.group(i))
+            final_str = result
 
             # if extracting partial match
             if i != full_m:
-                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
+                final_str = m.string[m.start(full_m):m.start(i)]
+                final_str += result
+                final_str += m.string[m.end(i):m.end(full_m)]
 
+            return final_str
+
+    @staticmethod
+    def parse_rules_from_config(config):
+        def parse_rule(obj):
+            match = obj.get('match')
+            replace = RegexRewriter.format(obj.get('replace', '{0}'))
+            group = obj.get('group', 0)
+            result = (match, replace, group)
             return result
-
+        return map(parse_rule, config)
 
 
 #=================================================================
-class JSLinkRewriter(RegexRewriter):
+class JSLinkOnlyRewriter(RegexRewriter):
     """
     JS Rewriter which rewrites absolute http://, https:// and // urls
     at the beginning of a string
     """
     JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
 
-    def __init__(self, rewriter, rules = []):
+    def __init__(self, rewriter, rules=[]):
         rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
-        super(JSLinkRewriter, self).__init__(rules)
+        super(JSLinkOnlyRewriter, self).__init__(rules)
+
 
 #=================================================================
-class JSLocationAndLinkRewriter(JSLinkRewriter):
+class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
     """
     JS Rewriter which also rewrites location and domain to the
     specified prefix (default: 'WB_wombat_')
     """
 
-    def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
+    def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
         rules = rules + [
              (r'(?<!/)\blocation\b', prefix, 0),
              (r'(?<=document\.)domain', prefix, 0),
         ]
-        super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
+        #import sys
+        #sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
+        super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
+
 
 #=================================================================
 # Set 'default' JSRewriter
-JSRewriter = JSLocationAndLinkRewriter
+JSRewriter = JSLinkAndLocationRewriter
 
 
 #=================================================================
 class XMLRewriter(RegexRewriter):
-    def __init__(self, rewriter, extra = []):
+    def __init__(self, rewriter, extra=[]):
         rules = self._create_rules(rewriter.get_abs_url())
 
-        RegexRewriter.__init__(self, rules)
+        super(XMLRewriter, self).__init__(rules)
 
     # custom filter to reject 'xmlns' attr
     def filter(self, m):
@@ -133,24 +151,28 @@ class XMLRewriter(RegexRewriter):
 
     def _create_rules(self, http_prefix):
         return [
-             ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
+             ('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
+              RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
         ]
 
+
 #=================================================================
 class CSSRewriter(RegexRewriter):
+
     CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
-    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
+
+    CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
+                               "(?!url[\\s\\(])([\w.:/\\\\-]+)")
 
     def __init__(self, rewriter):
         rules = self._create_rules(rewriter)
-
-        RegexRewriter.__init__(self, rules)
-
+        super(CSSRewriter, self).__init__(rules)
 
     def _create_rules(self, rewriter):
         return [
-             (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
-             (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
+             (CSSRewriter.CSS_URL_REGEX,
+              RegexRewriter.archival_rewrite(rewriter), 1),
+
+             (CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
+              RegexRewriter.archival_rewrite(rewriter), 1),
         ]
-
-
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 81cd23c9..80daf7e3 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -1,30 +1,24 @@
 import chardet
+import pkgutil
+import yaml
 
-from url_rewriter import UrlRewriter
-from html_rewriter import HTMLRewriter
-from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
-from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
+from header_rewriter import RewrittenStatusAndHeaders
 
+from rewriterules import RewriteRules
+
+from pywb.utils.dsrules import RuleSet
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
 
+
 class RewriteContent:
+    def __init__(self, config=None):
+        self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {})
 
-    DEFAULT_CONTENT_REWRITERS = {
-      'header': HeaderRewriter,
-      'js': JSRewriter,
-      'css': CSSRewriter,
-      'xml': XMLRewriter,
-      'html': HTMLRewriter
-    }
+    def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
+        header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
 
-
-    def __init__(self, rewriters = {}):
-        self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
-
-
-    def rewrite_headers(self, urlrewriter, status_headers, stream):
-        rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
+        rewritten_headers = header_rewriter_class().rewrite(status_headers, urlrewriter)
 
         # note: since chunking may be broken, approach taken here is to *always* attempt
         # to dechunk if transfer-encoding: chunked is present
@@ -37,7 +31,8 @@ class RewriteContent:
 
         return (rewritten_headers, stream)
 
-    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
+    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''):
+
         # see if we've already rewritten headers
         if isinstance(headers, RewrittenStatusAndHeaders):
             rewritten_headers = headers
@@ -50,9 +45,11 @@ class RewriteContent:
                 return (status_headers, gen)
 
         status_headers = rewritten_headers.status_headers
+
         # Handle text content rewriting
         # =========================================================================
         # special case -- need to ungzip the body
+
         if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
             stream = DecompressingBufferedReader(stream, decomp_type='gzip')
 
@@ -68,13 +65,25 @@ class RewriteContent:
 
         text_type = rewritten_headers.text_type
 
-        rewriter_class = self.rewriters.get(text_type)
-        if not rewriter_class:
+        #rewriter_class = self.rewriters.get(text_type)
+        rule = self.ruleset.get_first_match(urlkey)
+
+        try:
+            rewriter_class = rule.rewriters[text_type]
+        except KeyError:
             raise Exception('Unknown Text Type for Rewrite: ' + text_type)
 
+        #import sys
+        #sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey))))
 
         if text_type == 'html':
-            rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
+            head_insert_str = rule.create_head_inserts() + head_insert_str
+
+            rewriter = rewriter_class(urlrewriter,
+                                      outstream=None,
+                                      js_rewriter_class=rule.rewriters['js'],
+                                      css_rewriter_class=rule.rewriters['css'],
+                                      head_insert=head_insert_str)
         else:
             rewriter = rewriter_class(urlrewriter)
 
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index 1865f98e..9d752d10 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -2,12 +2,17 @@ import urllib2
 import os
 import sys
 import datetime
+import mimetypes
 
+from pywb.utils.loaders import is_http
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
+
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.rewrite_content import RewriteContent
 
+from pywb.cdx.canonicalize import canonicalize
+
 """
 Fetch a url from live web and apply rewriting rules
 """
@@ -26,10 +31,33 @@ def get_status_and_stream(url):
     return (status_headers, stream)
 
 #=================================================================
-def get_rewritten(url, urlrewriter):
-    (status_headers, stream) = get_status_and_stream(url)
+def get_local_file(uri):
+    fh = open(uri)
 
-    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
+    content_type, _ = mimetypes.guess_type(uri)
+
+    # create fake headers for local file
+    status_headers = StatusAndHeaders('200 OK', [('Content-Type', content_type)])
+    stream = fh
+
+    return (status_headers, stream)
+
+#=================================================================
+def get_rewritten(url, urlrewriter, urlkey=None):
+    if is_http(url):
+        (status_headers, stream) = get_status_and_stream(url)
+    else:
+        (status_headers, stream) = get_local_file(url)
+
+    # explicit urlkey may be passed in (say for testing)
+    if not urlkey:
+        urlkey = canonicalize(url)
+
+    status_headers, gen = RewriteContent().rewrite_content(urlrewriter,
+                                                           status_headers,
+                                                           stream,
+                                                           head_insert_str='',
+                                                           urlkey=urlkey)
 
     buff = ''
     for x in gen:
diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_rewrite.py
index d9fe8bfa..7498e601 100644
--- a/pywb/rewrite/test/test_rewrite.py
+++ b/pywb/rewrite/test/test_rewrite.py
@@ -121,7 +121,7 @@ r"""
 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
 
 # custom rules added
->>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
+>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
 
 # scheme-agnostic
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index 6d66ce60..f3a7667a 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -1,12 +1,39 @@
 from pywb.rewrite.rewrite_live import get_rewritten
 from pywb.rewrite.url_rewriter import UrlRewriter
 
+from pywb import get_test_dir
+
 # This module has some rewriting tests against the 'live web'
 # As such, the content may change and the test may break
 
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
 
 
+def test_local_1():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/')
+
+    # wombat insert added
+    assert '<head><script src="/static/default/wombat.js"> </script>' in buff
+
+    # location rewritten
+    assert 'window.WB_wombat_location = "/other.html"' in buff
+
+    # link rewritten
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+
+def test_local_2_no_js_location_rewrite():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite')
+
+    # no wombat insert
+    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
+
+    # no location rewrite
+    assert 'window.location = "/other.html"' in buff
+
+    # still link rewrite
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
 def test_example_1():
     status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
 
@@ -24,9 +51,10 @@ def test_example_2():
 
 
 
-#def test_example_3():
-#    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
+def test_example_domain_specific_3():
+    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
 
-#    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
+    # comment out bootloader
+    assert '/* Bootloader.configurePage' in buff, buff
 
 
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
new file mode 100644
index 00000000..5cf29154
--- /dev/null
+++ b/pywb/rules.yaml
@@ -0,0 +1,49 @@
+
+rules:
+
+    # twitter rules
+    #=================================================================
+    - url_prefix: 'com,twitter)/i/profiles/show/'
+
+      fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
+
+
+    # facebook rules
+    #=================================================================
+    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
+
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
+
+      canonicalize:
+        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+
+
+    - url_prefix: 'com,facebook)/'
+      rewrite:
+        js_regexs:
+            - match: 'Bootloader\.configurePage.*'
+              replace: '/* {0} */'
+
+
+    # yahoo rules
+    #=================================================================
+    - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
+
+      fuzzy_lookup: '([^/]+(?:\.css|\.js))'
+
+
+    # testing rules -- not for valid domain
+    #=================================================================
+    # this rule block is a non-existent prefix merely for testing
+    - url_prefix: 'example,example,test)/nolocation_rewrite'
+
+      rewrite:
+        js_rewrite_location: False
+
+
+    # all domain rules -- fallback to this dataset
+    #=================================================================
+    # Applies to all urls -- should be last
+    - url_prefix: ''
+      fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index a117f539..7813ded8 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -9,6 +9,7 @@ import urllib2
 import time
 
 
+#=================================================================
 def is_http(filename):
     return any(filename.startswith(x) for x in ['http://', 'https://'])
 
diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html
new file mode 100644
index 00000000..c4f3ce35
--- /dev/null
+++ b/sample_archive/text_content/sample.html
@@ -0,0 +1,14 @@
+<html>
+<head>
+<title>Sample Page For Rewrite Test</title>
+</head>
+<body>
+<script>
+var some_val = false;
+if (some_val) {
+    window.location = "/other.html";
+}
+</script>
+Test Content
+<a href="another.html">Some Link</a>
+</body>
diff --git a/setup.py b/setup.py
index 20ac8518..dac8a907 100755
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,8 @@ setuptools.setup(name='pywb',
         provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
         package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
         data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
-                      ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
+                      ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
+                      ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
         install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
 #        tests_require=['WebTest', 'pytest'],
         zip_safe=False)

From 453ab678ed47101b6a27422e5b83084d715ec5c6 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Wed, 26 Feb 2014 22:04:37 -0800
Subject: [PATCH 3/8] refactor domain specific rules: head insert callback
 passed in with rule, up to template to handle additional inserts based on
 rule properties; ability to pass in custom rules config to both cdx server
 and content rewriter; move canonicalize to utils pkg; add wombat, modify
 wb.js to remove wombat-related settings

---
 pywb/cdx/cdxdomainspecific.py          |  10 +-
 pywb/cdx/cdxserver.py                  |  10 +-
 pywb/config_utils.py                   |   8 +-
 pywb/pywb_init.py                      |   9 +-
 pywb/replay_views.py                   |  25 ++-
 pywb/rewrite/rewrite_content.py        |  17 +-
 pywb/rewrite/rewrite_live.py           |  18 +-
 pywb/rewrite/rewriterules.py           |  53 ++++++
 pywb/rewrite/test/test_rewrite_live.py |  18 +-
 pywb/static/wb.js                      |  25 +--
 pywb/static/wombat.js                  | 219 +++++++++++++++++++++++++
 pywb/ui/head_insert.html               |  11 +-
 pywb/{cdx => utils}/canonicalize.py    |  10 +-
 pywb/utils/dsrules.py                  |  98 +++++++++++
 pywb/wbapp.py                          |   4 +-
 setup.py                               |   2 +-
 16 files changed, 482 insertions(+), 55 deletions(-)
 create mode 100644 pywb/rewrite/rewriterules.py
 create mode 100644 pywb/static/wombat.js
 rename pywb/{cdx => utils}/canonicalize.py (95%)
 create mode 100644 pywb/utils/dsrules.py

diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py
index a9e06778..006dd88d 100644
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@@ -5,11 +5,11 @@ import pkgutil
 
 from pywb.utils.dsrules import BaseRule, RuleSet
 
-from canonicalize import unsurt, UrlCanonicalizer
+from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
 
 
 #=================================================================
-def load_domain_specific_cdx_rules(filename, surt_ordered):
+def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
     #fh = pkgutil.get_data(__package__, filename)
     #config = yaml.load(fh)
 
@@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
     fuzzy = None
 
     # Load Canonicalizer Rules
-    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize')
+    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
+                    ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
         for rule in rules:
@@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
         canon = CustomUrlCanonicalizer(rules, surt_ordered)
 
     # Load Fuzzy Lookup Rules
-    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup')
+    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
+                    ds_rules_file=ds_rules_file)
 
     if not surt_ordered:
         for rule in rules:
diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py
index 1a68f7e4..7f548ec4 100644
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@@ -1,4 +1,4 @@
-from canonicalize import UrlCanonicalizer, calc_search_range
+from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
 
 from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
@@ -17,13 +17,13 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
     def __init__(self, **kwargs):
-        ds_rules = kwargs.get('ds_rules')
+        ds_rules_file = kwargs.get('ds_rules_file')
         surt_ordered = kwargs.get('surt_ordered', True)
 
         # load from domain-specific rules
-        if ds_rules:
+        if ds_rules_file:
             self.url_canon, self.fuzzy_query = (
-                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
+                load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
         # or custom passed in canonicalizer
         else:
             self.url_canon = kwargs.get('url_canon')
@@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None):
     return server_cls(paths,
                       config=pass_config,
                       surt_ordered=surt_ordered,
-                      ds_rules=ds_rules_file,
+                      ds_rules_file=ds_rules_file,
                       perms_checker=perms_checker)
 
 
diff --git a/pywb/config_utils.py b/pywb/config_utils.py
index 672e8735..05844a2e 100644
--- a/pywb/config_utils.py
+++ b/pywb/config_utils.py
@@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
     return file
 
 #=================================================================
-def create_wb_handler(cdx_server, config):
+def create_wb_handler(cdx_server, config, ds_rules_file=None):
 
     record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
     paths = config.get('archive_paths')
 
-    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
+    resolving_loader = ResolvingLoader(paths=paths,
+                                       cdx_server=cdx_server,
+                                       record_loader=record_loader)
 
     replayer = replay_views.ReplayView(
         content_loader = resolving_loader,
 
-        content_rewriter = RewriteContent(),
+        content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
 
         head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
 
diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py
index be4bdded..bd63bfd5 100644
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):
 
         route_config = DictChain(value, config)
 
-        ds_rules = route_config.get('domain_specific_rules', None)
-        cdx_server = IndexReader(route_config, ds_rules)
+        ds_rules_file = route_config.get('domain_specific_rules', None)
+        cdx_server = IndexReader(route_config, ds_rules_file)
 
         wb_handler = config_utils.create_wb_handler(
-            cdx_server = cdx_server,
-            config = route_config,
+            cdx_server=cdx_server,
+            config=route_config,
+            ds_rules_file=ds_rules_file,
         )
 
         logging.debug('Adding Collection: ' + name)
diff --git a/pywb/replay_views.py b/pywb/replay_views.py
index 4c6907eb..9113ad5f 100644
--- a/pywb/replay_views.py
+++ b/pywb/replay_views.py
@@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed
 
+
 #=================================================================
 class ReplayView:
     def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@@ -99,20 +100,34 @@ class ReplayView:
     def rewrite_content(self, wbrequest, cdx, status_headers, stream):
         urlrewriter = wbrequest.urlrewriter
 
-        (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
+        result = self.content_rewriter.rewrite_headers(urlrewriter,
+                                                       status_headers,
+                                                       stream,
+                                                       cdx['urlkey'])
+        (rewritten_headers, stream) = result
 
         # no rewriting needed!
         if rewritten_headers.text_type is None:
             response_iter = self.stream_to_iter(stream)
             return WbResponse(rewritten_headers.status_headers, response_iter)
 
-        # do head insert
+        def make_head_insert(rule):
+            return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
+                                                           cdx=cdx,
+                                                           rule=rule))
+         # do head insert
         if self.head_insert_view:
-            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
+            head_insert_func = make_head_insert
         else:
-            head_insert_str = None
+            head_insert_func = None
 
-        (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
+        result = self.content_rewriter.rewrite_content(urlrewriter,
+                                                       rewritten_headers,
+                                                       stream,
+                                                       head_insert_func,
+                                                       cdx['urlkey'])
+
+        (status_headers, response_gen) = result
 
         if self.buffer_response:
             if wbrequest.wb_url.mod == 'id_':
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 80daf7e3..1ba3d321 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
 
 
+#=================================================================
 class RewriteContent:
-    def __init__(self, config=None):
-        self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {})
+    def __init__(self, ds_rules_file=None):
+        self.ruleset = RuleSet(RewriteRules, 'rewrite',
+                               default_rule_config={},
+                               ds_rules_file=ds_rules_file)
 
     def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
         header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
@@ -31,7 +34,7 @@ class RewriteContent:
 
         return (rewritten_headers, stream)
 
-    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''):
+    def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
 
         # see if we've already rewritten headers
         if isinstance(headers, RewrittenStatusAndHeaders):
@@ -65,7 +68,6 @@ class RewriteContent:
 
         text_type = rewritten_headers.text_type
 
-        #rewriter_class = self.rewriters.get(text_type)
         rule = self.ruleset.get_first_match(urlkey)
 
         try:
@@ -74,10 +76,13 @@ class RewriteContent:
             raise Exception('Unknown Text Type for Rewrite: ' + text_type)
 
         #import sys
-        #sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey))))
+        #sys.stderr.write(str(vars(rule)))
 
         if text_type == 'html':
-            head_insert_str = rule.create_head_inserts() + head_insert_str
+            head_insert_str = ''
+
+            if head_insert_func:
+                head_insert_str = head_insert_func(rule)
 
             rewriter = rewriter_class(urlrewriter,
                                       outstream=None,
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index 9d752d10..63783234 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -7,11 +7,11 @@ import mimetypes
 from pywb.utils.loaders import is_http
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.canonicalize import canonicalize
 
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.rewrite_content import RewriteContent
 
-from pywb.cdx.canonicalize import canonicalize
 
 """
 Fetch a url from live web and apply rewriting rules
@@ -43,7 +43,7 @@ def get_local_file(uri):
     return (status_headers, stream)
 
 #=================================================================
-def get_rewritten(url, urlrewriter, urlkey=None):
+def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
     if is_http(url):
         (status_headers, stream) = get_status_and_stream(url)
     else:
@@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None):
     if not urlkey:
         urlkey = canonicalize(url)
 
-    status_headers, gen = RewriteContent().rewrite_content(urlrewriter,
-                                                           status_headers,
-                                                           stream,
-                                                           head_insert_str='',
-                                                           urlkey=urlkey)
+    rewriter = RewriteContent()
+
+    result = rewriter.rewrite_content(urlrewriter,
+                                      status_headers,
+                                      stream,
+                                      head_insert_func=head_insert_func,
+                                      urlkey=urlkey)
+
+    status_headers, gen = result
 
     buff = ''
     for x in gen:
diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py
new file mode 100644
index 00000000..e1584162
--- /dev/null
+++ b/pywb/rewrite/rewriterules.py
@@ -0,0 +1,53 @@
+from pywb.utils.dsrules import BaseRule
+
+from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
+from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
+from html_rewriter import HTMLRewriter
+from header_rewriter import HeaderRewriter
+
+import itertools
+
+class RewriteRules(BaseRule):
+    """Per-url_prefix choice of header/css/xml/html/js rewriter classes"""
+    def __init__(self, url_prefix, config=None):
+        # fresh dict per call, not a shared mutable default argument
+        if config is None:
+            config = {}
+
+        super(RewriteRules, self).__init__(url_prefix, config)
+
+        self.rewriters = {
+            'header': config.get('header_class', HeaderRewriter),
+            'css': config.get('css_class', CSSRewriter),
+            'xml': config.get('xml_class', XMLRewriter),
+            'html': config.get('html_class', HTMLRewriter),
+        }
+
+        # Custom handling for js rewriting, often the most complex:
+        # location rewriting is on by default, can be toggled per rule
+        js_location = config.get('js_rewrite_location', True)
+        self.js_rewrite_location = bool(js_location)
+        if self.js_rewrite_location:
+            js_default_class = JSLinkAndLocationRewriter
+        else:
+            js_default_class = JSLinkOnlyRewriter
+
+        # set js class, using either default or override from config
+        self.rewriters['js'] = config.get('js_class', js_default_class)
+
+        # add any regexs for js rewriter
+        self._add_custom_regexs('js', config)
+
+    def _add_custom_regexs(self, field, config):
+        """Wrap the rewriter for field so it also gets '<field>_regexs'"""
+        regexs = config.get(field + '_regexs')
+        if not regexs:
+            return
+
+        rewriter_cls = self.rewriters[field]
+        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)
+
+        def extend_rewriter_with_regex(urlrewriter):
+            return rewriter_cls(urlrewriter, rule_def_tuples)
+
+        self.rewriters[field] = extend_rewriter_with_regex
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index f3a7667a..36e74848 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -8,9 +8,18 @@ from pywb import get_test_dir
 
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
 
+def head_insert_func(rule):
+    # test stand-in for the head insert template: the wombat.js script tag
+    # is only produced when the matched rule has js location rewriting on
+    wombat = '<script src="/static/default/wombat.js"> </script>'
+    return wombat if rule.js_rewrite_location else ''
+
 
 def test_local_1():
-    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/')
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'com,example,test)/',
+                                         head_insert_func)
 
     # wombat insert added
     assert '<head><script src="/static/default/wombat.js"> </script>' in buff
@@ -23,7 +32,10 @@ def test_local_1():
 
 
 def test_local_2_no_js_location_rewrite():
-    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite')
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'example,example,test)/nolocation_rewrite',
+                                         head_insert_func)
 
     # no wombat insert
     assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
@@ -55,6 +67,6 @@ def test_example_domain_specific_3():
     status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
 
     # comment out bootloader
-    assert '/* Bootloader.configurePage' in buff, buff
+    assert '/* Bootloader.configurePage' in buff
 
 
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index a7b39370..c4798da8 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -1,18 +1,21 @@
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
 
+This file is part of pywb.
 
-// Rewritten location and domain obj setup
-window.WB_wombat_location = window.location
+    pywb is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-if (window.top != window) {
-    window.top.WB_wombat_location = window.top.location
-}
-
-if (window.opener) {
-    window.opener.WB_wombat_location = window.opener.location
-}
-
-document.WB_wombat_domain = document.domain
+    pywb is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
+    You should have received a copy of the GNU General Public License
+    along with pywb.  If not, see <http://www.gnu.org/licenses/>.
+*/
 
 function initBanner()
 {
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
new file mode 100644
index 00000000..d2b7d12c
--- /dev/null
+++ b/pywb/static/wombat.js
@@ -0,0 +1,219 @@
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
+
+This file is part of pywb.
+
+    pywb is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    pywb is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with pywb.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//============================================
+// Wombat JS-Rewriting Library
+//============================================
+
+var WB_wombat_replayPrefix;
+var WB_wombat_replayDatePrefix;
+var WB_wombat_captureDatePart;
+var WB_wombat_origHost;
+
+	
+// Remove the ":port" part from an http:// url, other strings are unchanged
+function WB_StripPort(str)
+{
+  var found = str.match(/^http:\/\/[\w\d@.-]+:\d+/);
+  if (!found) {
+    return str;
+  }
+  var prefix = found[0];
+  return prefix.substr(0, prefix.lastIndexOf(':')) + str.substr(prefix.length);
+}
+
+// Guess whether str starts with a host name or an ip address
+function WB_IsHostUrl(str)
+{
+  // Good guess that it is a hostname
+  if (str.indexOf("www.") == 0) {
+    return true;
+  }
+
+  var hostPatterns = [
+    /^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/,    // hostname:port (port required)
+    /^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/    // ip, optionally followed by :port
+  ];
+
+  for (var i = 0; i < hostPatterns.length; i++) {
+    var found = str.match(hostPatterns[i]);
+    if (found && (found[0].length < 64)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Rewrite url so that it is loaded through the replay prefix;
+// anything that is not recognized as a url is returned unchanged
+function WB_RewriteUrl(url)
+{
+  var scheme = "http://";
+
+  // Not a non-empty string: nothing to rewrite
+  if (!url || (typeof url) != "string") {
+    return url;
+  }
+
+  // Already under the replay prefix. Only the prefix without
+  // date is checked, as date may be different for each capture
+  if (url.indexOf(WB_wombat_replayPrefix) == 0) {
+    return url;
+  }
+
+  var firstChar = url.charAt(0);
+
+  if (firstChar == "/") {
+    // Server relative url: left alone if it already contains the
+    // capture date, otherwise prefix and original host are added
+    if (url.indexOf(WB_wombat_captureDatePart) >= 0) {
+      return url;
+    }
+    return WB_wombat_replayDatePrefix + WB_wombat_origHost + url;
+  }
+
+  // Full http:// url: add prefix
+  if (url.indexOf(scheme) == 0) {
+    return WB_wombat_replayDatePrefix + url;
+  }
+
+  // Looks like it starts with a host name: add prefix and scheme
+  if (WB_IsHostUrl(url)) {
+    return WB_wombat_replayDatePrefix + scheme + url;
+  }
+  return url;
+}
+
+// Shallow copy of the enumerable non-function properties of obj
+function WB_CopyObjectFields(obj)
+{
+  var newObj = {};
+  // 'var prop' keeps the loop variable from becoming an implicit global
+  for (var prop in obj) {
+    if ((typeof obj[prop]) != "function") {
+      newObj[prop] = obj[prop];
+    }
+  }
+  return newObj;
+}
+
+// Return the part of href starting at the first "http" that follows
+// a "/" (searching from index 1), or href itself if there is none.
+// A falsy href gives "".
+function WB_ExtractOrig(href)
+{
+  if (!href) {
+    return "";
+  }
+
+  var text = href.toString();
+  var slashPos = text.indexOf("/http", 1);
+  return (slashPos > 0) ? text.substr(slashPos + 1) : text;
+}
+  
+// Copy of a location object with href set to the original (un-prefixed) url
+function WB_CopyLocationObj(loc)
+{
+  var newLoc = WB_CopyObjectFields(loc);
+
+  newLoc._origLoc = loc;
+  newLoc._origHref = loc.href;
+  newLoc.href = WB_ExtractOrig(newLoc._origHref);
+  // replace/assign rewrite the target url first; reload is also forwarded
+  // to the real location, as a native method cannot be invoked on the copy
+  newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); };
+  newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); };
+  newLoc.reload = function(forceGet) { this._origLoc.reload(forceGet); };
+  newLoc.toString = function() { return this.href; };
+  return newLoc;
+}
+
+// Navigate location to the rewritten reqHref if it names another original url
+function WB_wombat_updateLoc(reqHref, origHref, location)
+{
+  var changed = reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref));
+  if (changed) {
+    location.href = WB_RewriteUrl(reqHref);
+  }
+}
+
+// Propagate a change made to a WB_wombat_location value to the real location
+function WB_wombat_checkLocationChange(wbLoc, isTop)
+{
+  var location = (isTop ? window.top.location : window.location);
+  switch (typeof wbLoc) {
+  case "string":
+    // A string has been assigned over the location copy
+    WB_wombat_updateLoc(wbLoc, location.href, location);
+    break;
+  case "object":
+    WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location);
+    break;
+  }
+}
+
+var wombat_updating = false;
+
+// Apply pending changes of the WB_wombat_location copies (polled by timers)
+function WB_wombat_checkLocations()
+{
+  if (wombat_updating) {
+    return false;
+  }
+  wombat_updating = true;
+  try {
+    WB_wombat_checkLocationChange(window.WB_wombat_location, false);
+    if (window.self.location != window.top.location) {
+      WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
+    }
+  } finally {
+    wombat_updating = false;  // reset even on error, or polling stops for good
+  }
+}
+
+// Set globals used for url rewriting and install WB_wombat_location copies;
+// args: replay url prefix, capture timestamp, host of the original url
+function WB_wombat_Init(replayPrefix, captureDate, origHost)
+{
+  WB_wombat_replayPrefix = replayPrefix;
+  WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/";
+  WB_wombat_captureDatePart = "/" + captureDate + "/";
+  WB_wombat_origHost = "http://" + origHost;
+
+  // Location copies for this window and, if different, top and opener
+  window.WB_wombat_location = WB_CopyLocationObj(window.self.location);
+
+  if (window.self.location != window.top.location) {
+    window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location);
+  }
+
+  if (window.opener) {
+    // opener is known to be set here, no second check needed
+    window.opener.WB_wombat_location = WB_CopyLocationObj(window.opener.location);
+  }
+
+  document.WB_wombat_domain = origHost;
+}
+
+// First check 100 ms after this script has run
+setTimeout(WB_wombat_checkLocations, 100);
+
+
+// Then keep checking every 500 ms
+setInterval(WB_wombat_checkLocations, 500);
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index b30cd015..aa910442 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -1,7 +1,14 @@
 <!-- WB Insert -->
+{% if rule.js_rewrite_location %}
+<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
 <script>
-wbinfo = {}
-wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
+  WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
+</script>
+{% endif %}
+
+<script>
+  wbinfo = {}
+  wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
 </script>
 <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
 <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
diff --git a/pywb/cdx/canonicalize.py b/pywb/utils/canonicalize.py
similarity index 95%
rename from pywb/cdx/canonicalize.py
rename to pywb/utils/canonicalize.py
index e2f818b9..bd21e4ca 100644
--- a/pywb/cdx/canonicalize.py
+++ b/pywb/utils/canonicalize.py
@@ -3,8 +3,6 @@
 
 import surt
 import urlparse
-from cdxobject import CDXException
-
 
 #=================================================================
 class UrlCanonicalizer(object):
@@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
         return canonicalize(url, self.surt_ordered)
 
 
+#=================================================================
+class UrlCanonicalizeException(Exception):
+    def status(self):  # an HTTP status line string for this error
+        return '400 Bad Request'
+
+
 #=================================================================
 def canonicalize(url, surt_ordered=True):
     """
@@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
     try:
         key = surt.surt(url)
     except Exception as e:
-        raise CDXException('Invalid Url: ' + url)
+        raise UrlCanonicalizeException('Invalid Url: ' + url)
 
     # if not surt, unsurt the surt to get canonicalized non-surt url
     if not surt_ordered:
diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py
new file mode 100644
index 00000000..2e6f9626
--- /dev/null
+++ b/pywb/utils/dsrules.py
@@ -0,0 +1,98 @@
+import yaml
+import pkgutil
+
+#=================================================================
+
+DEFAULT_RULES_FILE = 'rules.yaml'
+DEFAULT_RULES_PKG = 'pywb'
+
+
+#=================================================================
+class RuleSet(object):
+    DEFAULT_KEY = ''
+
+    def __init__(self, rule_cls, fieldname, **kwargs):
+        """
+        A domain specific rules block, loaded from a yaml rules file.
+
+        rule_cls: class instantiated as rule_cls(url_prefix, rules_def)
+        fieldname: key of each rules entry that holds this rule type
+        ds_rules_file (kwarg): rules file name inside the package,
+            the default rules file is used if not specified
+        default_rule_config (kwarg): if not None, a catch-all rule for
+            DEFAULT_KEY is added unless the rules file defines one
+        """
+
+        self.rules = []
+
+        ds_rules_file = kwargs.get('ds_rules_file')
+        default_rule_config = kwargs.get('default_rule_config')
+
+        config = self.load_default_rules(ds_rules_file)
+        # a missing or empty 'rules' list is treated as no rules at all,
+        # instead of failing on iteration when there is no default either
+        rulesmap = (config.get('rules') if config else None) or []
+
+        def_key_found = False
+
+        # iterate over master rules file
+        for value in rulesmap:
+            url_prefix = value.get('url_prefix')
+            rules_def = value.get(fieldname)
+            if not rules_def:
+                continue
+
+            if url_prefix == self.DEFAULT_KEY:
+                def_key_found = True
+
+            self.rules.append(rule_cls(url_prefix, rules_def))
+
+        # if default_rule_config provided, always init a default ruleset
+        if not def_key_found and default_rule_config is not None:
+            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
+
+    @staticmethod
+    def load_default_rules(filename=None, pkg=None):
+        """
+        Load and parse a yaml rules file stored as package data,
+        falling back to DEFAULT_RULES_FILE / DEFAULT_RULES_PKG.
+        """
+        if not filename:
+            filename = DEFAULT_RULES_FILE
+
+        if not pkg:
+            pkg = DEFAULT_RULES_PKG
+
+        yaml_str = pkgutil.get_data(pkg, filename)
+        # NOTE(review): yaml.load can construct arbitrary objects,
+        # rules files must come from a trusted source
+        return yaml.load(yaml_str)
+
+    def iter_matching(self, urlkey):
+        """
+        Iterate over all matching rules for given urlkey
+        """
+        for rule in self.rules:
+            if rule.applies(urlkey):
+                yield rule
+
+    def get_first_match(self, urlkey):
+        """Return first rule (in file order) matching urlkey, or None"""
+        for rule in self.rules:
+            if rule.applies(urlkey):
+                return rule
+
+
+#=================================================================
+class BaseRule(object):
+    """Base class for rules keyed on one or more url_prefix values"""
+    def __init__(self, url_prefix, rules):
+        is_list = isinstance(url_prefix, list)
+        self.url_prefix = url_prefix if is_list else [url_prefix]
+
+    def applies(self, urlkey):
+        # true if urlkey starts with any of the prefixes
+        for prefix in self.url_prefix:
+            if urlkey.startswith(prefix):
+                return True
+        return False
diff --git a/pywb/wbapp.py b/pywb/wbapp.py
index 0befa172..ac51ba9d 100644
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
 from wbrequestresponse import WbResponse, StatusAndHeaders
 
 from pywb.cdx.cdxserver import CDXException
+from pywb.utils.canonicalize import UrlCanonicalizeException
 from pywb.warc.recordloader import ArchiveLoadFailed
 
 import os
@@ -55,7 +56,8 @@ def create_wb_app(wb_router):
         except InternalRedirect as ir:
             response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
 
-        except (WbException, CDXException, ArchiveLoadFailed) as e:
+        except (WbException, CDXException,
+                UrlCanonicalizeException, ArchiveLoadFailed) as e:
             response = handle_exception(env, wb_router.error_view, e, False)
 
         except Exception as e:
diff --git a/setup.py b/setup.py
index dac8a907..0750fe55 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
         license='GPL',
         packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
         provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-        package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
+        package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
         data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                       ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
                       ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],

From 22f1f78fcabbc5deec3c441d9fc0ffce3d43f178 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Thu, 27 Feb 2014 18:22:10 +0000
Subject: [PATCH 4/8] cdx: clean up filters, add '~' modifier for contains
 rules: fix regex to be lazy not greedy, turn off unneeded custom
 canonicalizer (need tests for custom canon) cleanup fuzzy match query fix
 data package in setup.py

---
 pywb/cdx/cdxdomainspecific.py   | 17 +++++++++++------
 pywb/cdx/cdxops.py              | 33 ++++++++++++++++++++++++---------
 pywb/cdx/cdxserver.py           | 15 +++++++--------
 pywb/cdx/test/cdxserver_test.py | 10 ++++++++++
 pywb/rules.yaml                 |  9 +++++----
 setup.py                        |  2 +-
 6 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py
index 006dd88d..54654b5e 100644
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
     if rules:
         fuzzy = FuzzyQuery(rules)
 
-    logging.debug('CANON: ' + str(canon))
-    logging.debug('FUZZY: ' + str(fuzzy))
+    logging.debug('CustomCanonilizer? ' + str(bool(canon)))
+    logging.debug('FuzzyMatcher? ' + str(bool(fuzzy)))
     return (canon, fuzzy)
 
 
@@ -73,6 +73,8 @@ class FuzzyQuery:
 
         urlkey = params['key']
         url = params['url']
+        filter_ = params.get('filter', [])
+        output = params.get('output')
 
         for rule in self.rules.iter_matching(urlkey):
             m = rule.regex.search(urlkey)
@@ -82,7 +84,7 @@ class FuzzyQuery:
             matched_rule = rule
 
             if len(m.groups()) == 1:
-                params['filter'] = '=urlkey:' + m.group(1)
+                filter_.append('~urlkey:' + m.group(1))
 
             break
 
@@ -91,10 +93,13 @@ class FuzzyQuery:
 
         inx = url.find('?')
         if inx > 0:
-            params['url'] = url[:inx + 1]
+            url = url[:inx + 1]
+
+        params = {'url': url,
+                  'matchType': 'prefix',
+                  'filter': filter_,
+                  'output': output}
 
-        params['matchType'] = 'prefix'
-        params['key'] = None
         return params
 
 
diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py
index 247f3d18..1a90d7ca 100644
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
             if self.invert:
                 string = string[1:]
 
-            self.exact = string.startswith('=')
-            if self.exact:
+            # exact match
+            if string.startswith('='):
                 string = string[1:]
+                self.compare_func = self.exact
+            elif string.startswith('~'):
+                string = string[1:]
+                self.compare_func = self.contains
+            else:
+                self.compare_func = self.regex
 
             parts = string.split(':', 1)
             # no field set, apply filter to entire cdx
@@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
                 self.field = parts[0]
                 string = parts[1]
 
-            if self.exact:
-                self.exact_str = string
-            else:
+            # make regex if regex mode
+            if self.compare_func == self.regex:
                 self.regex = re.compile(string)
+            else:
+                self.filter_str = string
 
         def __call__(self, cdx):
             val = cdx[self.field] if self.field else str(cdx)
-            if self.exact:
-                matched = (self.exact_str == val)
-            else:
-                matched = self.regex.match(val) is not None
+
+            matched = self.compare_func(val)
+
             return matched ^ self.invert
 
+        def exact(self, val):
+            return (self.filter_str == val)
+
+        def contains(self, val):
+            return (self.filter_str in val)
+
+        def regex(self, val):
+            return self.regex.match(val) is not None
+
     filters = map(Filter, filter_strings)
 
     for cdx in cdx_iter:
diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py
index 7f548ec4..8eff842c 100644
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@@ -50,14 +50,14 @@ class BaseCDXServer(object):
 
         url = params['url']
 
-        if self.fuzzy_query and params.get('allowFuzzy'):
-            if not 'key' in params:
-                params['key'] = self.url_canon(url)
+        # check if fuzzy is allowed and ensure that its an
+        # exact match
+        if (self.fuzzy_query and params.get('allowFuzzy') and
+            params.get('matchType', 'exact') == 'exact'):
 
-            params = self.fuzzy_query(params)
-            if params:
-                params['allowFuzzy'] = False
-                return self.load_cdx(**params)
+            fuzzy_params = self.fuzzy_query(params)
+            if fuzzy_params:
+                return self.load_cdx(**fuzzy_params)
 
         msg = 'No Captures found for: ' + url
         raise CaptureNotFoundException(msg)
@@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
                 msg = 'A url= param must be specified to query the cdx server'
                 raise CDXException(msg)
 
-            #params['key'] = self.url_canon(url)
             match_type = params.get('matchType', 'exact')
 
             key, end_key = calc_search_range(url=url,
diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py
index 0e799ce9..384d7187 100644
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
 
+# Filter contains
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+
+# Filter contains invert
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
+com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
+com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
+
 # Collapse by timestamp
 # unresolved revisits, different statuscode results in an extra repeat
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 5cf29154..8927d2f1 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -12,11 +12,12 @@ rules:
     #=================================================================
     - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
 
-      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
 
-      canonicalize:
-        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
-        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+# not actually needed, fuzzy match is used instead here
+#      canonicalize:
+#        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+#        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
 
 
     - url_prefix: 'com,facebook)/'
diff --git a/setup.py b/setup.py
index 0750fe55..94c1bca7 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
         license='GPL',
         packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
         provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-        package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
+        package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
         data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                       ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
                       ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],

From 7863b2bade76443823a702dc81e9fc76128a9f7d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Thu, 27 Feb 2014 20:10:44 +0000
Subject: [PATCH 5/8] add sample data for zipnum #17

---
 sample_archive/zipcdx/zipnum-sample.cdx.gz | Bin 0 -> 9768 bytes
 sample_archive/zipcdx/zipnum-sample.idx    |  38 +++++++++++++++++++++
 sample_archive/zipcdx/zipnum-sample.loc    |   1 +
 3 files changed, 39 insertions(+)
 create mode 100644 sample_archive/zipcdx/zipnum-sample.cdx.gz
 create mode 100644 sample_archive/zipcdx/zipnum-sample.idx
 create mode 100644 sample_archive/zipcdx/zipnum-sample.loc

diff --git a/sample_archive/zipcdx/zipnum-sample.cdx.gz b/sample_archive/zipcdx/zipnum-sample.cdx.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8687b97a4f9db618b9b3defac200e2ffb89e865b
GIT binary patch
literal 9768
zcmZ9SRa6~okcIJJ32?B3yW4@_?tXB0cS~@0hu}_dcS3Nt;O?%$B{)HcduOd#!+q<Q
ze)y{PzjxK&O%w?aL6?l84E6dwUe`Jh1PngEqx;HpaQRFU$o$hrSEjH*t*{jxTPs(t
zc8UJ+Mp==&!DVgO6Mb-qOI|L1cS-Pkr(*HQ8itw@AH6J$6vMcNxu{93WF=H83o`!>
z&Q`VoHo7Z!#uUT7+s0I&)Zk=n&T(_h{eatt(_|Ws;|6t<!XPRzrBV3Y05b(a$5)4f
zDgl-&c{Nj{&_~|2v{d&Mg8>FQNJIK5*?Xh%Oj5#M^Yb;ZAN$l4T+@O8=P~g5=g((e
z>^ozr7iemxg+NxHXcQ*0V{PeS&ycb*12VsQfcLS{-eUtcm^uTIOD{MwESiw$1lBba
z-F27YJA1m4VSQ(w^P!^vWZpB>a|{#&1kwN1q&==%E>I9P<ea}>4@}>BQiF6OPE+|a
zSP8`7#tdUgp_W{uE#>85X^f&4Egqe=WAbvIfiq>pI9xG2&YUO85he#-o2Ah>&?|ia
z?074qkHMXmeSirf{e$cQ%Zw5*74UGu!kmy{X;oaCP}5KRJx5R70AiZKEVorxSF&;}
zU1x<iJG|P$cEw79RQ!>76+^y|UKN`=#+XM3Og##`n>PI!a4}d<*6T|7!r4ZMKZwkk
zxB8Unl_3+(!)?K>7bz{Jqt;wfuxM7cQ+A^mtt?DGETG~^wEaEE#JQc}N;WC4hg3cI
zNNNsa$v8!BlK#({A-8Lw+nI+A&a#L(H?(;dG_X1z<k3!I2)C>a^mZbC{hHPx=i4+o
zMAlI_f&7h=BOJk(3}si~cW$fKMnP12k}srhzO)h(^!qxF$n^+f2;TYh+kkEMr<!NB
zt9Z*SXcS0cIm{|xGd5*s(iZ!w8!5g(ei4E<bjEgc+{@I_wLh0G-DU6@iSCY4m?(71
zPhx9QX;u?IFG#9tz?<d&Dx79P1u2+4uXYFI0PncKs4uMe67P^nk{#nO^t)P0N8)2p
zB}_T+5l}=PS|E>Ta^mw2tmiNR&PlY#rd|4ly1aLl3Ec>nUuT(DPbZ@J<+Co<*?#+z
z>ca$dA6~Z5CdLhOo_pMFEkk=x$H~oPH>8S%9=;lysGbh%EGyAfIBIJQEVjhzGi2vD
zU&jjrW9o{f8ZZ=r;+Ac)`$I?0gx$vJr;gM&rKn-IP*^$@@b^n%5>#BJkZyi%U4B<c
zu=k=m91y^5J_AB7<a<~zcIxTz_jOAE%k&Cjh~2$JG-9TwaS+)9*80V23RTmM^6bbf
zZIKgYhmhf(ZsZVkF?6H`A9TlmJGeKLn}g@n8lQ~!@l&WV(4P#n&zty8wbknxmzQ^_
zu3jLvDikHj(2V1{Qtpa9#~(Y)p#ztyUZAh=K^c%&4&B$PjXP^EtzV);;n#npWKREX
z)2SHlF|kblJwmE4(B4t~*<*ABHQX-|k|pG4kmEv^!OcYUbE99nh99{?W}j9xc`Pxw
z4FYnFCYm$Z4}{K?(`^7a8UVy9BM}Foz?119JwfsG#7p$QN1g7r>Jdcs+w#mdstPu9
zHFx2#ZYvDU$-`1+ZNQeQ{N-mt(F#16;XvcX@Q9DSLVo?zz1ZSLb4BaB*V^uw2_qfy
zWI=b{)Q(S^qRx2#W8Z%78Ev^1AW4FNM$$d>IFfo*WT~-W(7^aI)K;H%$9Uy2!Rp?=
zf3v04GA#l@G{il3;gvpywL~eq%_+5?;Z(osQw89Ys&k=$sH6xIN2O_6JHoBFcmII`
zT>u37p^C?;{2+orto#6n3M3Cyl!qvo>Vkk3&+C4B#KreBNMt|Ve}R00dI)#?4{CqB
zjQpFc44nH!w5P*AZq{yojJ5!0W4K=%sY%DdX#QwTD#%_B?(1XKvr6|!@0ImQ*ci=@
zJSs9{Xbq_jPK+hCNdum_m6Px4E=kWz??iKGE0+>X+Ts~*A4%K5tS24Wk(@lP;{z$N
z=a^E1e^q=KUE?S+&-2TBsxYnegS`Ah@ipL(+&il<91l(i2u`qIn_w@WbdONtGlCfC
z_T!$t;{~1J0?nM8wtcbxFwqM_6s9U2%R*sYibJ|jVY^}V?;^uJ0A6EBunl(6ux;*i
zY2?vSP3!GbIUk_DvYm>C9()LQ)>X!qoofqjQz&F908l(m(xJM^&Le%5-(LEfW^=+h
z(n~*x@O|Hs*F9DCkUF^5_L2V&{WcNBF;Z9p=hxMD;^?0ldlLsn$d-5YwhhwQQ+8UP
zE}C|bRv{dlok?seoV2)E5uC>jWtG-J70H@!Td4V$%xXfbl|@ev&JaP^op=Y2uVCca
z?<16pl;j;1`~nqJ%vm|w)W3qUqAXSZ5bn4OTdp(|OrY+U>r0PR{XCPg-;l1nkvJNW
zAvl6Z;I*HZVaEJxRKI+p+Ci7)5)%xn?q|=Q*kdZnAom*-q=_aCTxsqQDy!zX`w(L2
z;S~atBvp0qhn)3oLcwo2X9}8SR{(?xGA-@f0@Cze;NXLf`9)t$bRdgnWea!SqfQ9P
z^n!rDE?Tom)eXUy9pLq!tskeD6*zR#%2Uw9)sgaXq<(G|H{KL%jv1Y!yoP39-Bi?8
zwqu1F!$VON;P(yHGsC-<E*q@G-k16CXipgP*81d<6Xz!4pn67MuwdsMbMu=tUEMGh
zI_ocgDlE1y9Dm6FGv|q(J_u-7wOw!v?C&+J^Hl8bXj@^DW{vsnSlgnzmf;f*s>CEq
zqZ?jaSex9u8Ay6Wf~GQnAgh!bx&U%*k|PgyZ-zywya<q)CYq<FO#zAPvjPku!e-@v
zQJK_Ma{UK0UgjTX3x0F9R?Jb}GhVyt?_ttHwzawJcEbwOz*yNdoR9a%V5j`e@yV*Q
zy*~u}XBWB67P|Z6e_gQ2BuFq5yv|!OngwQcopeWq*V@`!x|EiCV<(042&($<^^~|r
zInN)nqJeuNS05~yW7WtGFFusI8V-m&um55ff@?gUgQ~7+(`Q_mXjxU^$QVI9%=Mt%
zU%2AICB*^e`z6+>b*rxp8jKC3y@tYN35uY_s;TV+;cDT(;=!B9>Mhc+MI=NGV<l70
zDr|eIT$B(3s25BwHwd$FfBAggec#d<uzU{)*!9-ZJazQOjtZtxJXx5$9w($IU0DKC
zMt^6`NSUW=EIA8a1M;nx-B;dD);U{u<smJ--i02ZB<d$G<b{?7xFhj7@%dlYs?95d
z^NlVaS7F3b;psuS@NL>FtHO)U-#aJnTNr$4#&l?wQuMw_f44zeEQ=Xd!E!TTl{X_N
zJygb@a7;HFd93IpH)_!-NZ8s{Y(hf;HL_qBLa4Qh<M?L(8JVE-j_4_|y(!`-H-@8v
z|04c6Ecs3Ohb6lziEW{E2~v55xlQ@*{HdT{Ox=Z-7f19qJ?Y=gE%%y(V)v|%n!FK3
ztXw6ay-Xw2SdhcdUP$veguYjb#|vR9O^r!t)&frQ{h-ZtG2*DtAqQL}-4w(m20hQX
zw*;54{!$nkkMHX?4Yq#s$+t-G?bktac6C`GsB30xP@&$1mLby%kcvXn;%JfyetEEv
zXSl}de*E(Z<#R(N;ZYvV+26Cx{|!GAythb5jHbiIZn~^nUx^{RdVzVC3=@tQ&CCs{
zyG!BP*IlU1?X}T<PSCn3b&@RXRMwp$bkp{ktGY#npwPU`;$_%z#v1XuXwkGG+PLKP
zoIg%+!zp@}gwV2{6sIlRG~bvsn7jMljzDSw1rKRhkcf{?N)=#18b~G_6^kw?Oq`Cu
zyl0IQoS;K4tU~#K)C`p-d53oTo^<M;oPQ&2?KG+quG?r$xSlUE5-dr=_}DC|H3KYJ
ziq}QHKVEZF`eOzvZbm-;18T6xvYk_1xc+;9j~OM?b2#L(uIO3HqwdW3^+wyXW*>X|
z8KEb_NGBDPHvi}7*;MKfgluwxy28;}a3(@ar2O}8R^disdEva$norc=2dU>M`-+o#
zL7vVAUq!as;}3ef>ofMXGTsW&oL=xQK(TwFy;dkahVBs?9{U0r6`C5>h=oLS6f%xE
zM~RYMSr{V&GQe+{+-f=|X!%tHAZS9pBtJ+M$8j0<9<@Cd{7?9WoWJ}Veq5~wl{x+v
z(knTpz5ND-l6jLgc+4*z{)9H^zbvBlsLpw>Mpu6R)=XOXOVgo`6Nr<$TIjYmJHKhB
zJWE*}Z5)wul<V%}RH>WKPYQ4PH4K2lr`gJk4RqDr|3utqIRagwIUx<F_G9Ge(j|Ch
zyPhRU>{EQ)x8x~P{88sFmRGZ_;)E6Y8Fdg!pa!{jpAZBTO^lR+MH>fb_42uZ>wc)H
z((3p&kiec4Cu&{a(wA|0du*G*Kgaas*un$5cp(zP%o5j3r`!S2f0`xZ-be*Qi3LW~
z?Zi!NQ$S($_(D6r3wB%fE24x9E$OR}lH1V}HkkI|F5XX2sp>ZU2|}P`Crsy^7TV+G
zkpcuU`p&&0Tt8a9p0Ctvdw>EWJ{7tl#DSP-%gL_~S6ZCmg=>!gnC*SA$5m%6Ya8Fc
zR&pZvM+cV2Wi3>C5KPp+U=e>su{s<-x`@4Xe)l(NG+ie!bQN<}_gq!|rY7i$UzVeF
zhJT5W%v&;^Goduxm79ge>{#J%0)<6*PV}@|56^Rw(C|RTUxH~CHm(9#TLuXp!uXxt
zSh!e(4OT*>gCP<&;uL?}*E7FJ0ULlKjE_1|th|PJ&oI|}o~d@kx4{l(bG>*`ZL+n!
zJ^SV)DwwoH!$KGrfs`3cnqtsZ%(1Kq514dW#Nl70-OVSZtIOrhI?@8rG9<Qz<k^T~
z3t|IRcHyTILS<%jij^8{>9QCUgP%ld$9kkft}06+;eI(RiVl5k-{>n}TeaJ9i#rO!
zw&vOIE36xIqB5>Y?{<$q*{pwAMg8lo;2o`EF;*S7>lasf*&Y{hzb}a-J1+Y`c8DSg
zH;<lQDxmb}+qObV=Ow`pQT`mW@gAo>?*H`LoUVa&YdwkgvuI8uR%B$jAC-I%cJPe$
zmp)U!jL0+w*dyJ9<vfFG;%7@s=@n%sWpyxfPz#|W@AP3y?Rbr!VYhC~u%XgfZ)^P;
z0g3kpPiW%I(UEA)gtL&-U(Jl=9;jQz6806{M1D#cwBkc<W5fmRT2YER;@#dMC>V6V
z_%FkTHUdtZb9bsdMDfj>!s#+f>rkX$`+ODiLJ0|I87l!o(%HQ$66)!nqlzX?mt<91
z;v?!}C8V@{G%vNcgqx7gvQ@FfGC!b1h{@A_bNjJ8avy9b`U4lo8w3BIxHp^Z5~jew
z*FUo)GnfVlE$&T8_4)!SYb)l>^S;IL|F$^jZFJY}UOhd1rf`vlfeqK9o)Q;^XO51O
z0d3BBY9V6L))Kgw3Ai-4{4N<DUy3EVD%uJXX6epY#q3XWhi%JKh$Rt?gp;%N#Bp4A
zCh%O;I30E!XLRIp?XR~g!-7h+qpEZEkOUM`MygPbI3<;U+dh}`jonb*l0jPF_#={u
zew?_MfTObyd_b3-5-1{l^JhcfuEYjcSwtc2POwi5PTmpGOu#2Hz%}$9k@VdHD>ZEU
z(;fkMbZbTAJ<)XSn`n{YHrI*=X%|~%(pzs=6A=-WoE6F<6P2v6=@yK-iyK?GI7?>P
zc3=O(m+COn&Ny->4pdZ8&|Jd~b33~GV6DNlAkdMENCy{e&9bJ{Cfcq<NXV*Gxuw`B
zr%0vfkF?ly1s=MvXj6r9nq|=g1oSA&Q#q!_D{fgAGD@^B1eGD+mf+wg9wb7k?E;UW
zi^Pt;ACdtB4Gnw@;!m@20%}W6?qpTcJ<Wfh__I9qvZ1vO{QZ!?TIXZB4ygjkT>KxT
zseczNxXo61X7%n9GgmX$C=Ri1;cyR&@b*Mhk<tD^JDVJN`@GFD5#?>mO>?vR&1xNH
zni*y8<c+jDrBKA+c>XsmfGw1*aPD-&cdgI(#}B*-KP+*lzK0KEPB3Q7noj=~C}gsR
zH@INdw@s+#q)hLm0CrMkq`HjAw=NymFEOknK_HWW#7!_)*IflfFL=scdipHfilEpd
zE(B!LJAy33k#Ee6+?du-4n^Vi28nMSe{8GQ%VCLd>Wnx0QddtT1Kj_^H4T=5W=4JE
zddtqdjESLw3!Pjb8fNKyB#iE~;DTg}Q@_6bd_Hy|y$quv<>+G<d5T+Bk0<AErqW0q
zgwnKK<HDuisTsC->&rd2q44#g!n8`lu|kWt&jYagy`Y*4t`UkCJ~C&LhPL%_{#M=Z
z)SDrM2Y-E>y_+u^BPukP%j@;+o|f&Ob_n={lw<0aA@kVgd*l~Uhv_2qss2)TU^?OZ
zM*S}2EAT#|JJxqGX3a@}S|fLAPEBC!Mm1Wy#c;CSFF*n%!{1<B2ZJ1^F@frVzTT<!
zp)e00%iqiLt%toJWQa-6PT_?fqY*3m0X}nuBj$PLA4MsEmQA>Z<|Yz<Z|?*GPZ}Ev
zDu42VP>eiZi?VYo#=EhUlprgTMEuV~=6^`I=I0~``kSvnvq~y`L8&w$#w%ILO1Iuj
zzg*DrI{EUew}pPSSdW8|5O}Zf&h^ygW9aULZJg}pgGXf4o2J|C-FSYI=deNMQ_60+
zPH~FdXKUW$eJ?7aGrrM1P@77;h6NQTkLezm6c>=y%vr&|7SrKpaS-s0b>+<M9UWm4
zCQ2d}zek%M`xkBVF-v$sXosl($6A&)^$l=^TDNWl-qrznO(!$9E6YdDjt5hGP>LAg
zP=s2vvt;D_GR}8iKPx|$E5U2fWMNaprF7U4Q**{$l8u7=2!lea>F23!zx&3WjfFYM
z!{g@%mMpi5H$T=M+|)e38CbcstaD~6`B*u2wmBcn)6~=6d1P!g<OkoKLlF;R#5DuG
zUZK8-4QL(?2PnpW<w4-nAD7Y(kaA!io14qLzNQhk7ETYyLi#+N@i%J?ar%DY1t}au
z8?NL%T>OYt;M@Ls=N++>thT?c^K+!2G+nJv!`NAwJSiNNsU*5kUZ1HPP8y78YQUMf
zmHE13Jgl{DCKadCyFi~3OI%S3{6z3uVOVJ?#a)YnMYUWw;3bJvmQyGU9q!gvdrjzV
z{^v$6F=g1d8M^EUnBj*RHI<Z=xUwD4TGGSlMBU^4n_+<mH=@xLBY22{mTK+CTxAFn
zca~~b{=Kj24%yKKg&x7!Pn2i)ov@D5+ViL8QxvWsHnM-?9j&cT{sx5@GhF%Dh1cOm
zG1Zc{^Hm&?_*QarIKu*9`KbG<M+}3}YhgW}*x>lld_zaR$JqoT;=%gcS++k8ANIZI
zIlY!RSB~sK>gz{!;unD?WQ9Njr^~cX2wLPplc@KlnwEJRe^4PdJ>QPA(3#Agp+xCV
z#;M*sMiR6Y;pW`D$!h!Kc?x*Lea_JxW1#k79L=y%nEYN9Ga0KWMuwwea`%{3<dIt7
zh<Ff7H+&pxi1iHbueW(HbiHy=?LpmDUBz5SS4%kq4r*}vhFRN;C39S@-23YyRmN|x
zfy+!L$`x>n0|n$CNl0vI5NtN)IgX^jhg*HSlD02}7|iDbszHxi(wkp)^Qck~^%pQj
zK|>y2)2J9f2pQjbZLDJdF*&WZeor*XhtvalGpIHFFXaXaBZX2?`S*&tefCaqqyXq;
zNd~N^8#Rd!LWsICt)4{dv8~_a>wb=*yc}`n#>zVQ%GS}hz+f*vZ{?)_qLxI@!-Uz-
zYr5fN8i{Y0*Bo4{K4uw<;T7O}TEj*aV$nNE&@|e0V9mCFY_;{}%CNC$SDWd!;>uE^
zLn~SIo5z*vrrLfEY&NHR8cA9NE~xnzVX5aRMLP@ypp_I>3}DXM7Oh3dsO47<6c#lx
zr=})8TiwAn9r@k<hw1GKt49I#-}RksdHpW`41x~U8r7oG$r#tBiX%5>Rn6+@21_j@
zxG&a;Enz70v%Xx>kB4~^Q<sk$vbiKesV%^CTealzs_CSa6Sjk5WJA&iX&@Alt@g%0
zQud5(Nt*L5Z5_4C9lI-HW^fP$=@xdA#?Ja`FeY43;#6bTVFL!de26NGuYCF;GZBcn
zlrsZ%!UIFgJFCllcRYGGS%`yo5p_dk;<&fjMf%EXy0H7`YWspdVN`!85G8bbt+?7l
zu4J31`SC!mMb60+`JinTA-i95Zr2f^Yai4V6g-uHFIMi95W-4w(RZic9~P#-J3)Hu
zCQ5-phXSff<op{<G`DoaX!X9?@k4Jm_O{-5BkCeW*E1b5eAL~ZMm4s`)C;hRDlZ6+
zBHx}au0GLefffE_XdriG&Ceh0e3LY#?TwIPrrfJ;y?}Q*FQdF{3&10W{~Q!f437}e
ztqniM7G%3ZsY_p5#h*9}zOs`hh*}fY4xvT%a!6Y5qe4qvn_05p?@S^eE5XOM<i{(X
zIs86K$(oG?KsKt)Rq!+7H$lE$(K;P2cxkH{P&STGls}E>J980KKaJubVZnzTPJ>eV
z65J;=o)k(T8(=6<dnqsqc<Ef~EIn5ffTL@l>2NASXjkPn-1Xlhy)@mW@ntl*xNHJ4
zTz=`U%Mfd69L`rEUkWeD%tBX%kmgi9VAa48*sah@i$A!l^??K{mxGMoSL)qCZ>g)+
zJXI6A&LIw(WUO@qQK+b}Jlv!;=7_k(a*csv&Ou^Hqe>YoG_7WA=!1nLOU0XJY)+-Z
zjFQB-at_w%U&l388tgU{Rfei*w3F{x8^Bt-Cx<4thR03PBSqSw#EsH6CV~Sy59FV^
zxmdSMi5T(>Fpp+fmi=dfr{P$0<6)ks=hbR6+9>-wH7vz{;YvxP^m>$t_W=TKqc%lq
zcRKu0g4$YnQR%KzHBiE=H}*FcIbfT3SPb7oO@jY%FS!48ua?$2eY-})JQRQ^{3#&v
zgR-y~MZSKvd(@{bp*_}Kp5ER>=axYm!~NZK4j7spMzU=G435dh02UR=QbYL0hRWn1
zr?)*e{D+*c?NV8bdAM!e4`O+$MwAtDXSMogE<`UThXIk_k1G)JTU^sqzk$`;4(LM$
z1VmkjGLo>;ioI;e5<J>eb`R-EuSQd{;WvUfCioJDI7vp!6IQeo*gG}IH;Eb_af45;
z@@qI5=Ap;x|8(#_!ckLea{Y&E`rj*Sh<}ExYRY8qQ5NMk|Bt1>+Azg~ew!MhFdBH3
zS$(Fj%wSn>{6Rv>x}~(gEK6lW@`!b815X_`%>V%XkN{J~4^xq{U=>bL*iW7(Ef0xT
z9p{x7VspD2s%b4gv@A22bIG0K(pN^y6*Nb77F57Y>-r#I?_|uZsaJ-=j}FN-R(_e2
zK^bUvyl)NRaV}vncp0}vLlsZYB@Ya06UBYwI9s60Yz<Uaqm}TVU15{+e_R=)(qD)m
zxCaScZi@J97_wb6_H2C{p^}~xZp+Vd7M#y#on`$cM(~Kx5**sYrn_9f)Q7dr%^ko_
zE*ruqoiW`(YYp^Bo4)Vz{~x7=T+Pj0&GGExP$NRwEn&;C-<Db~;97z1a<TnM<)jEg
z-!bTQf<ty^L{l|0vUGK!l3YcLT2p@`=0$my;Tk$HA{BMXiqm<`l*ANXXjM4x;`c_g
z!hPWNKulRBcd7`!{lV|K%H*>z-&w9!W*CPW%USp6hf<atJ6S$h!?SYPVmu{oaEtzB
zvwQ(N4U(bLK%tXZaio@<3I3FYvjo+4+U-hM*Np#|770hO5T~?)!6U)B%DDfCA=~f#
z->-CI|A@E${29~n<5yR6U5UmF5&+>t2>)C57KK*K#XlU+*a?NAp0I*OJ)id4SIGle
z`3Jl^CJ-56myz9F+6l%2O<`w|C=}u`dPD-0HT6O=)m8g((MG_Ry6XLVU1JwnSe6J_
z_b*zE$#$$?2g+l^*b{U?8{)b0bq4(D;A3<o{6fDV;rKu7_PiRGC%-$%q>HUZ@!+Lo
z*02=wpF|=|tTVL8<~N+9Bn7od8W?bc^IPPnF!et3qv|KXkZ=7L=xvtxC#9Yvmf+m=
z*FC9LXh^s6P9XW%Nl1z!E6OQuj_b5w8H8)*x<LD*16;vUA2hw99-UHBR6us<a#vRr
z%pCD4ll0^}!W;=mK_aT(#S`b9!C9nAsn@Ml-^Q}ichi$JVX>Odq(`xV9-Mk`TE{mg
z#}n*^n@5A$?o&UxY6YcWsTID!GF=sakUDSQHW!`Z%lk;3W^{^XyN^Oc>%VLnO&zLB
zCM{T5(7Qsqe4^{LWIMyDGM10>>jUbPBsb`t@{*=Uz6O9W;5NOpNms^I0K$?rJd8>~
z5yaFf;80pxIlAP&<=eyZdhG90hfRZ&j^o+qnNn+`c4bDII=!K=p}?UC^4|Wp&2zmF
z_QMKNfXS!{rE%fMJ>d#Fl2se_=_;wRjHSa4@Kltca)(+QAP3fouRAcT8P3rtLn-~C
z8Plw)a;6J0I;WGt5;6&tqofL*a8tHB%_yAituxozvt6qfNd)MJXBL%@zYWgwLU&-y
z%5GqQpXB*<<~T7gktG84S*lG>c*h@4RE~wUWf&W>kp@mwSp^KOZ)uL|pvpee;?9`7
z15hhWZJ-1*V=^!z*6xLhg4HI_U&H_62?_yCMx&hZ_j(6^8ZAU|o1IArEi}1S@+0ZE
zc8Wwp)Pq%}nXb>XDw;}u4S0w$P7wv+uMh~-c_n{R68>O_ztgb>R;o_8xm0jPA9!fk
zIgP7l=Qhu#=VBW8AgIZ81>HdxgDIV2AQ4`2zCv;)mY-PIoN}mXQe<Ue^7s)D0BNkz
zd!liHU{gzxLG!LN!P7GKZ-?i@A5VdPy$qG553P~yib1M^3R%O^w2J7k`Q+(74|d+7
zlIWv^dn~U*Om)fx3q&hzqZ>)8?%bF%ek4jHJ+f+0T#K_eTi2!3`&Z4c3k%iXjwP&h
zb*xO4&s|wyx}A36wldK_lUSz0&U?jw^1j97R0M>2aKiEZ1?}N(W;*=pa$e;UV0=8z
zZag|Hx&7O~%SNowdyV*azp1f-gf}ZB<hW3eouUtu=E@dL%PkjIBsZO;QwR<oI)s?w
z2q`R{2*urzL1V6kwef}Ba!*?m%Wag9rzP^V%QVeDL9~Q3liLVmP09p<lR!{YFE&D^
zL*OJLR5}ntG3`i+k5nG$4_w6Q<(y|mV~t!27<D!;Xfd9+Gjg;^-A2sMl*!}`d5rqB
z=%RXi^yI0|`iep_%3~X?;%QGd5mq^vb^lraU@t%{kt<w?l#rr|Z1j3~1~r}|xP(%^
zyuMN2193xT_1*s&-zRnL|2+<m&k^g4OE7X=3}@dEBVu#C;DdSC!_#FoV@FLpedZ`4
zZH8E7m^>WUZ@KJes?ONws#@lf#~PMTZkNp#&?Y}BbNH3w9mH0&p4|w|d<yN|<m9nf
zR<&;{-{7%bv(;Bopo$MbHRO!^{5h>v4>y`Q)rj*po|pj|E6&ro>!+VX!j}=!Vh$A5
zM_vu@Rn|^H+xF^}Mi2L@>8U5>o9~A+Mn7#i;;E4m4`hMpg648z<HEbOAD=|?2WhsB
zn0B!(!H#;#{H&Yy^ItmTX}{tbH4I)VqIu+9L&uFrx`&eqpim|RKUqb7{MZv)-cv<l
zIZ~V@&EO(cNV=RBbtHH&3ll?0k34|JOPRL&0~xVZQXD|0cu$p9DjJ2?Rq5MZVQnCQ
z>&R>A*k$=G?L20O6<cvQNE@zpgl~6f$W$XSHy#q=)-)SfO<R8XuE5cgLv}qN4A@EU
z&Yl(NU~<BQGbk8%oKz$H#X?kKfWdB4;n6~hkQf*i^K#pEy`A4fsZuY(CWQ|Vsd+3y
z*OHp1UUBb;Or657C>V@1y%QAcUqAn--ph&q=|;JPaqh@PZa;fj<M?!M(EzHE0rlGy
zZ~zkj=0HI-K`78|@}@bZN%xc0Kq!45gvj78O3oA_7Pji@j;!6JCeMuyRaHugdGOqX
z73MT1!d9JMOi?azuZFA{c*?Hj%RnTjjF^MJ!2le*)*$5_@uLny;j&q)yB$We-aHnq
z_K{ZU=V!gGC9LM5$IhGxxiA4)rDXs*peb4o8+|p?g8?ndizRrFCqLJb7w-$O@Wk-?
zurY78zfGF`!gyEzf|m2NF+Vn}i8(5s8AW7+C};=rH@P(0(XZ>Tre;EdHV2D$-O^TE
zDH(_{Ypn;)%@G$8Z5&rk8Mj>Lf??)yF;t{mfqK$2dhwae7>W@PF4QKztQ&wHw+P~Q
zzB0ljy)Yy@Dl<FFG&BmHs~nz?VNe6HfLNJ9tSp~dKp+lw^+9SHnwseaCMKo@XqvgV
zHzd?_<Fyp?vK8Yqpl8i<3;?DkXLvYgMEGcy7+Zkd156A%Lc(1<Bdb01t!*PfK@L)o
kI3;B=Vv$+0*<KN%0oe$8Wc<x^c@xCuF(gF{1O&wY08tP_<^TWy

literal 0
HcmV?d00001

diff --git a/sample_archive/zipcdx/zipnum-sample.idx b/sample_archive/zipcdx/zipnum-sample.idx
new file mode 100644
index 00000000..a70d8e87
--- /dev/null
+++ b/sample_archive/zipcdx/zipnum-sample.idx
@@ -0,0 +1,38 @@
+com,example)/ 20140127171200	zipnum	0	276
+org,iana)/ 20140127171238	zipnum	276	328
+org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055	zipnum	604	312
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718	zipnum	916	235
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912	zipnum	1151	235
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240	zipnum	1386	306
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654	zipnum	1692	235
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816	zipnum	1927	231
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128	zipnum	2158	236
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240	zipnum	2394	312
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805	zipnum	2706	234
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055	zipnum	2940	235
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308	zipnum	3175	289
+org,iana)/_css/2013.1/print.css 20140126200737	zipnum	3464	208
+org,iana)/_css/2013.1/print.css 20140126200929	zipnum	3672	207
+org,iana)/_css/2013.1/print.css 20140126201248	zipnum	3879	276
+org,iana)/_css/2013.1/screen.css 20140126200706	zipnum	4155	210
+org,iana)/_css/2013.1/screen.css 20140126200825	zipnum	4365	211
+org,iana)/_css/2013.1/screen.css 20140126201227	zipnum	4576	216
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654	zipnum	4792	236
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816	zipnum	5028	219
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128	zipnum	5247	221
+org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625	zipnum	5468	299
+org,iana)/_img/2013.1/icann-logo.svg 20140126200719	zipnum	5767	210
+org,iana)/_img/2013.1/icann-logo.svg 20140126200912	zipnum	5977	212
+org,iana)/_img/2013.1/icann-logo.svg 20140126201240	zipnum	6189	281
+org,iana)/_img/bookmark_icon.ico 20140126200631	zipnum	6470	298
+org,iana)/_js/2013.1/iana.js 20140126200716	zipnum	6768	213
+org,iana)/_js/2013.1/iana.js 20140126200912	zipnum	6981	216
+org,iana)/_js/2013.1/iana.js 20140126201239	zipnum	7197	270
+org,iana)/_js/2013.1/jquery.js 20140126200653	zipnum	7467	215
+org,iana)/_js/2013.1/jquery.js 20140126200816	zipnum	7682	209
+org,iana)/_js/2013.1/jquery.js 20140126201127	zipnum	7891	210
+org,iana)/_js/2013.1/jquery.js 20140127171239	zipnum	8101	410
+org,iana)/dnssec 20140126201307	zipnum	8511	373
+org,iana)/domains/int 20140126201239	zipnum	8884	353
+org,iana)/domains/root/servers 20140126201227	zipnum	9237	386
+org,iana)/time-zones 20140126200737	zipnum	9623	145
diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc
new file mode 100644
index 00000000..249e1071
--- /dev/null
+++ b/sample_archive/zipcdx/zipnum-sample.loc
@@ -0,0 +1 @@
+zipnum  ./sample_archive/zipcdx/zipnum-sample.cdx.gz

From bff39626b52c322d15856e7098e77162d84c8c53 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Thu, 27 Feb 2014 12:33:11 -0800
Subject: [PATCH 6/8] add first set of zipnum tests #17 still need to test
 timed reload, multi sources

---
 pywb/cdx/cdxserver.py                   |  2 +-
 pywb/cdx/test/cdxserver_test.py         |  5 +--
 pywb/cdx/test/zipnum_test.py            | 44 +++++++++++++++++++++++++
 sample_archive/zipcdx/zipnum-sample.loc |  2 +-
 setup.py                                |  1 +
 5 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 pywb/cdx/test/zipnum_test.py

diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py
index 8eff842c..fd0c14e9 100644
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@@ -214,7 +214,7 @@ def create_cdx_source(filename, config):
     if filename.endswith('.cdx'):
         return CDXFile(filename)
 
-    if filename.endswith('.summary'):
+    if filename.endswith(('.summary', '.idx')):
         return ZipNumCluster(filename, config)
 
     return None
diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py
index 384d7187..44483ca4 100644
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@@ -142,8 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
  ('filename', 'dupes.warc.gz')]
 
 # NOTE: external dependency -- need self-contained test
-#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
-#>>> pprint.pprint(x.next().items())
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
+>>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
  ('timestamp', '20020120142510'),
  ('original', 'http://example.com:80/'),
@@ -172,6 +172,7 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
     results = server.load_cdx(**kwparams)
 
     for x in results:
+        x = x.replace('\t', '    ')
         sys.stdout.write(x)
 
 
diff --git a/pywb/cdx/test/zipnum_test.py b/pywb/cdx/test/zipnum_test.py
new file mode 100644
index 00000000..7c98309a
--- /dev/null
+++ b/pywb/cdx/test/zipnum_test.py
@@ -0,0 +1,44 @@
+"""
+>>> zip_ops_test(url = 'http://iana.org')
+org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
+org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
+org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
+
+# test idx index (tabs replaced with 4 spaces)
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
+org,iana)/dnssec 20140126201307    zipnum    8511    373
+org,iana)/domains/int 20140126201239    zipnum    8884    353
+org,iana)/domains/root/servers 20140126201227    zipnum    9237    386
+
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
+org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
+org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
+org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
+org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
+org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
+org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
+
+"""
+
+
+
+
+from cdxserver_test import cdx_ops_test
+
+from pywb import get_test_dir
+test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
+print test_zipnum
+
+def zip_ops_test(url, **kwargs):
+    sources = test_zipnum
+    cdx_ops_test(url, sources, **kwargs)
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+
+
diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc
index 249e1071..df4f3196 100644
--- a/sample_archive/zipcdx/zipnum-sample.loc
+++ b/sample_archive/zipcdx/zipnum-sample.loc
@@ -1 +1 @@
-zipnum  ./sample_archive/zipcdx/zipnum-sample.cdx.gz
+zipnum	./sample_archive/zipcdx/zipnum-sample.cdx.gz
diff --git a/setup.py b/setup.py
index 94c1bca7..307506fe 100755
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@ setuptools.setup(name='pywb',
         provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
         package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
         data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
+                      ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
                       ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
                       ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
         install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],

From 921b2eb2e1135cdf62aa9b770a7c50597ef35494 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Thu, 27 Feb 2014 18:43:55 -0800
Subject: [PATCH 7/8] improve testing and a few fixes: archivalrouter: support
 empty collection, with and without SCRIPT_NAME cdx: remove cdx source test,
 including access denied replay: when content-type present, limit the
 decompressed stream to content-length (this ensures last 4 bytes in warc/arc
 record are not read) integration tests for identity replay

---
 pywb/archivalrouter.py           |  6 +++--
 pywb/cdx/cdxsource.py            |  4 ++-
 pywb/cdx/perms.py                |  2 +-
 pywb/cdx/test/cdxserver_test.py  |  6 ++++-
 pywb/replay_views.py             | 12 +++++++++
 pywb/test/test_archivalrouter.py | 14 +++++++++++
 pywb/utils/canonicalize.py       | 13 +++++++---
 tests/test_integration.py        | 42 +++++++++++++++-----------------
 8 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py
index 4d28b57e..5d3dc9f4 100644
--- a/pywb/archivalrouter.py
+++ b/pywb/archivalrouter.py
@@ -50,7 +50,10 @@ class Route:
 
     def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
         self.path = regex
-        self.regex = re.compile(regex + lookahead)
+        if regex:
+            self.regex = re.compile(regex + lookahead)
+        else:
+            self.regex = re.compile('')
         self.handler = handler
         # collection id from regex group (default 0)
         self.coll_group = coll_group
@@ -70,7 +73,6 @@ class Route:
             return None
 
         matched_str = matcher.group(0)
-
         if matched_str:
             rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
             wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py
index 783cf36b..ba5f8b3b 100644
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@@ -1,6 +1,8 @@
 from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 
+from cdxobject import AccessException
+
 import urllib
 import urllib2
 import itertools
@@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
         self.key_prefix = self.DEFAULT_KEY_PREFIX
         if config:
             self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
-        
+
 
     def load_cdx(self, params):
         """
diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py
index a7b90eb4..ad6ea00d 100644
--- a/pywb/cdx/perms.py
+++ b/pywb/cdx/perms.py
@@ -1,7 +1,7 @@
 
 
 #=================================================================
-class AllowAllPerms:
+class AllowAllPerms(object):
     """
     Sample Perm Checker which allows all
     """
diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py
index 44483ca4..e5fac6b3 100644
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
  ('offset', '334'),
  ('filename', 'dupes.warc.gz')]
 
-# NOTE: external dependency -- need self-contained test
+# NOTE: external dependency -- need self-contained test TODO
 >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
 >>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
@@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
  ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
  ('length', '1792')]
 
+
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+Traceback (most recent call last):
+AccessException: Blocked By Robots
 """
 
 #=================================================================
diff --git a/pywb/replay_views.py b/pywb/replay_views.py
index 9113ad5f..31e7af9a 100644
--- a/pywb/replay_views.py
+++ b/pywb/replay_views.py
@@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed
 
+from pywb.utils.loaders import LimitReader
 
 #=================================================================
 class ReplayView:
@@ -54,10 +55,21 @@ class ReplayView:
 
                 response = None
 
+                # if Content-Length for payload is present, ensure we don't read past it
+                content_len = status_headers.get_header('content-length')
+                try:
+                    content_len=int(content_len)
+                    if content_len > 0:
+                        stream = LimitReader(stream, content_len)
+                except ValueError:
+                    pass
+
                 if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
                     response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
                 else:
                     (status_headers, stream) = self.sanitize_content(status_headers, stream)
+                    #status_headers.remove_header('content-length')
+
                     response_iter = self.stream_to_iter(stream)
                     response = WbResponse(status_headers, response_iter)
 
diff --git a/pywb/test/test_archivalrouter.py b/pywb/test/test_archivalrouter.py
index 4379fbfd..229fafb6 100644
--- a/pywb/test/test_archivalrouter.py
+++ b/pywb/test/test_archivalrouter.py
@@ -15,6 +15,13 @@
  'wb_prefix': 'https://localhost:8081/my_pywb/web/',
  'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
 
+# route with no collection
+>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
+{'coll': '',
+ 'request_uri': 'http://example.com',
+ 'wb_prefix': '/pywb/',
+ 'wb_url': None}
+
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
 
@@ -67,6 +74,13 @@ False
 >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
 False
 
+# With no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
+'http://localhost:8080/2013/http://example.com/other.html'
+
+# With SCRIPT_NAME but no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
+'http://localhost:8080/pywb-access/http://example.com/other.html'
 
 """
 
diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py
index bd21e4ca..73555ca6 100644
--- a/pywb/utils/canonicalize.py
+++ b/pywb/utils/canonicalize.py
@@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
     >>> calc_search_range('http://example.com/path/file.html', 'host', False)
     ('example.com/', 'example.com0')
 
-    # domain range not supported
+    # errors: domain range not supported
     >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
     Traceback (most recent call last):
-    Exception: matchType=domain unsupported for non-surt
+    UrlCanonicalizeException: matchType=domain unsupported for non-surt
+
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    Traceback (most recent call last):
+    UrlCanonicalizeException: Invalid match_type: blah
+
     """
     def inc_last_char(x):
         return x[0:-1] + chr(ord(x[-1]) + 1)
@@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
 
     elif match_type == 'domain':
         if not surt_ordered:
-            raise Exception('matchType=domain unsupported for non-surt')
+            raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
 
         host = start_key.split(')/')[0]
 
@@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
 
         end_key = host + '-'
     else:
-        raise Exception('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
 
     return (start_key, end_key)
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 1a7a943c..5a165041 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -2,6 +2,7 @@ import webtest
 from pywb.pywb_init import pywb_config
 from pywb.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.perms import AllowAllPerms
 
 class TestWb:
     TEST_CONFIG = 'test_config.yaml'
@@ -73,7 +74,19 @@ class TestWb:
 
         assert 'Mon, Jan 27 2014 17:12:38' in resp.body
         assert 'wb.js' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
+
+    def test_replay_identity_1(self):
+        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
+        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
+        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
+        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body
 
     def test_replay_content_length_1(self):
         # test larger file, rewritten file (svg!)
@@ -198,38 +211,21 @@ class TestWb:
 # Reporter callback for replay view
 class PrintReporter:
     def __call__(self, wbrequest, cdx, response):
-        print wbrequest
-        print cdx
+        #print wbrequest
+        #print cdx
         pass
 
 #=================================================================
-class TestExclusionPerms:
+class TestExclusionPerms(AllowAllPerms):
     """
-    Sample Perm Checker which allows all
+    Sample Perm Checker with hard-coded exclusion
     """
     def allow_url_lookup(self, urlkey, url):
         """
         Return true/false if url or urlkey (canonicalized url)
         should be allowed
         """
-        print urlkey
         if urlkey == 'org,iana)/_img/bookmark_icon.ico':
             return False
 
-        return True
-
-    def allow_capture(self, cdx):
-        """
-        Return true/false is specified capture (cdx) should be
-        allowed
-        """
-        return True
-
-    def filter_fields(self, cdx):
-        """
-        Filter out any forbidden cdx fields from cdx dictionary
-        """
-        return cdx
-
-
-
+        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)

From 304a33aa5b188751e3f69e7930969cbb72d7cbc7 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Thu, 27 Feb 2014 18:52:41 -0800
Subject: [PATCH 8/8] add coverage badge, re-enable PrintReporter output in tests

---
 README.md                 | 1 +
 tests/test_integration.py | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 726d9709..83f1aa28 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@ PyWb 0.2 Beta
 ==============
 
 [![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb)
+[![Coverage Status](https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master)](https://coveralls.io/r/ikreymer/pywb?branch=master)
 
 pywb is a Python re-implementation of the Wayback Machine software.
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 5a165041..5f6bb666 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -211,9 +211,8 @@ class TestWb:
 # Reporter callback for replay view
 class PrintReporter:
     def __call__(self, wbrequest, cdx, response):
-        #print wbrequest
-        #print cdx
-        pass
+        print wbrequest
+        print cdx
 
 #=================================================================
 class TestExclusionPerms(AllowAllPerms):