Merge remote-tracking branch 'origin/master' into cdx-server

Conflicts: pywb/cdx/cdxdomainspecific.py pywb/cdx/cdxserver.py pywb/cdx/test/cdxserver_test.py setup.py tests/test_integration.py
2025-03-15 00:03:28 +01:00 · 2014-02-28 19:47:24 +00:00 · 2014-02-28 19:47:24 +00:00 · 1f65eff828
commit 1f65eff828
parent 9eda5ad97e 304a33aa5b
38 changed files with 931 additions and 203 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -2,6 +2,9 @@
 omit = 
    */test/*
    */tests/*
+    *.html
+    *.js
+    *.css

 [report]
 exclude_lines =
--- a/.travis.yml
+++ b/.travis.yml
@ -3,9 +3,8 @@ python:
  - "2.7"
 # command to install dependencies
 install:
-  - "python setup.py -q install"
-  - "pip install python-coveralls"
-  - "pip install pytest-cov"
+  - python setup.py -q install
+  - pip install coverage pytest-cov coveralls --use-mirrors
 # command to run tests
 #script: nosetests --with-doctest
 #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
--- a/README.md
+++ b/README.md
@ -2,6 +2,7 @@ PyWb 0.2 Beta
 ==============

 [![Build Status](https://travis-ci.org/ikreymer/pywb.png?branch=master)](https://travis-ci.org/ikreymer/pywb)
+[![Coverage Status](https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master)](https://coveralls.io/r/ikreymer/pywb?branch=master)

 pywb is a Python re-implementation of the Wayback Machine software.

--- a/pywb/archivalrouter.py
+++ b/pywb/archivalrouter.py
@ -50,7 +50,10 @@ class Route:

    def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
        self.path = regex
-        self.regex = re.compile(regex + lookahead)
+        if regex:
+            self.regex = re.compile(regex + lookahead)
+        else:
+            self.regex = re.compile('')
        self.handler = handler
        # collection id from regex group (default 0)
        self.coll_group = coll_group
@ -70,7 +73,6 @@ class Route:
            return None

        matched_str = matcher.group(0)
-
        if matched_str:
            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
            wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -3,34 +3,43 @@ import re
 import logging
 import pkg_resources

-from canonicalize import unsurt, UrlCanonicalizer
+from pywb.utils.dsrules import BaseRule, RuleSet
+
+from pywb.utils.canonicalize import unsurt, UrlCanonicalizer


 #=================================================================
-def load_domain_specific_cdx_rules(filename, surt_ordered):
-    fh = pkg_resources.resource_string(__name__, filename)
-    config = yaml.load(fh)
+def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
+    #fh = pkg_resources.resource_string(__name__, filename)
+    #config = yaml.load(fh)
+
+    canon = None
+    fuzzy = None

    # Load Canonicalizer Rules
-    rules = StartsWithRule.load_rules(config.get('canon_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
+                    ds_rules_file=ds_rules_file)
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()

    if rules:
        canon = CustomUrlCanonicalizer(rules, surt_ordered)
-    else:
-        canon = None

    # Load Fuzzy Lookup Rules
-    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
-                                      surt_ordered)
+    rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
+                    ds_rules_file=ds_rules_file)
+
+    if not surt_ordered:
+        for rule in rules:
+            rule.unsurt()

    if rules:
        fuzzy = FuzzyQuery(rules)
-    else:
-        fuzzy = None

-    logging.debug('CANON: ' + str(canon))
-    logging.debug('FUZZY: ' + str(fuzzy))
+    logging.debug('CustomCanonilizer? ' + str(bool(canon)))
+    logging.debug('FuzzyMatcher? ' + str(bool(canon)))
    return (canon, fuzzy)


@ -43,10 +52,7 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
    def __call__(self, url):
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.match(urlkey)
            if not m:
                continue
@ -67,11 +73,10 @@ class FuzzyQuery:

        urlkey = params['key']
        url = params['url']
+        filter_ = params.get('filter', [])
+        output = params.get('output')

-        for rule in self.rules:
-            if not any(urlkey.startswith(x) for x in rule.starts):
-                continue
-
+        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.search(urlkey)
            if not m:
                continue
@ -79,7 +84,7 @@ class FuzzyQuery:
            matched_rule = rule

            if len(m.groups()) == 1:
-                params['filter'] = '=urlkey:' + m.group(1)
+                filter_.append('~urlkey:' + m.group(1))

            break

@ -88,28 +93,40 @@ class FuzzyQuery:

        inx = url.find('?')
        if inx > 0:
-            params['url'] = url[:inx + 1]
+            url = url[:inx + 1]
+
+        params = {'url': url,
+                  'matchType': 'prefix',
+                  'filter': filter_,
+                  'output': output}

-        params['matchType'] = 'prefix'
-        params['key'] = None
        return params


 #=================================================================
-class StartsWithRule:
-    def __init__(self, config, surt_ordered=True):
-        self.starts = config.get('startswith')
-        if not isinstance(self.starts, list):
-            self.starts = [self.starts]
+class CDXDomainSpecificRule(BaseRule):
+    def __init__(self, name, config):
+        super(CDXDomainSpecificRule, self).__init__(name, config)

-        self.regex = re.compile(config.get('matches'))
-        self.replace = config.get('replace')
+        if isinstance(config, basestring):
+            self.regex = re.compile(config)
+            self.replace = None
+        else:
+            self.regex = re.compile(config.get('match'))
+            self.replace = config.get('replace')

    def unsurt(self):
-        # must convert to non-surt form
-        self.starts = map(unsurt, self.starts)
-        self.regex = unsurt(self.regex)
-        self.replace = unsurt(self.replace)
+        """
+        urlkey is assumed to be in surt format by default
+        In the case of non-surt format, this method is called
+        to desurt any urls
+        """
+        self.url_prefix = map(unsurt, self.url_prefix)
+        if self.regex:
+            self.regex = unsurt(self.regex)
+
+        if self.replace:
+            self.replace = unsurt(self.replace)

    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -151,9 +151,15 @@ def cdx_filter(cdx_iter, filter_strings):
            if self.invert:
                string = string[1:]

-            self.exact = string.startswith('=')
-            if self.exact:
+            # exact match
+            if string.startswith('='):
                string = string[1:]
+                self.compare_func = self.exact
+            elif string.startswith('~'):
+                string = string[1:]
+                self.compare_func = self.contains
+            else:
+                self.compare_func = self.regex

            parts = string.split(':', 1)
            # no field set, apply filter to entire cdx
@ -164,19 +170,28 @@ def cdx_filter(cdx_iter, filter_strings):
                self.field = parts[0]
                string = parts[1]

-            if self.exact:
-                self.exact_str = string
-            else:
+            # make regex if regex mode
+            if self.compare_func == self.regex:
                self.regex = re.compile(string)
+            else:
+                self.filter_str = string

        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
-            if self.exact:
-                matched = (self.exact_str == val)
-            else:
-                matched = self.regex.match(val) is not None
+
+            matched = self.compare_func(val)
+
            return matched ^ self.invert

+        def exact(self, val):
+            return (self.filter_str == val)
+
+        def contains(self, val):
+            return (self.filter_str in val)
+
+        def regex(self, val):
+            return self.regex.match(val) is not None
+
    filters = map(Filter, filter_strings)

    for cdx in cdx_iter:
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,4 +1,4 @@
-from canonicalize import UrlCanonicalizer, calc_search_range
+from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range

 from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
@ -17,13 +17,13 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
    def __init__(self, **kwargs):
-        ds_rules = kwargs.get('ds_rules')
+        ds_rules_file = kwargs.get('ds_rules_file')
        surt_ordered = kwargs.get('surt_ordered', True)

        # load from domain-specific rules
-        if ds_rules:
+        if ds_rules_file:
            self.url_canon, self.fuzzy_query = (
-                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
+                load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
        # or custom passed in canonicalizer
        else:
            self.url_canon = kwargs.get('url_canon')
@ -50,14 +50,14 @@ class BaseCDXServer(object):

        url = params['url']

-        if self.fuzzy_query and params.get('allowFuzzy'):
-            if not 'key' in params:
-                params['key'] = self.url_canon(url)
+        # check if fuzzy is allowed and ensure that it's an
+        # exact match
+        if (self.fuzzy_query and params.get('allowFuzzy') and
+            params.get('matchType', 'exact') == 'exact'):

-            params = self.fuzzy_query(params)
-            if params:
-                params['allowFuzzy'] = False
-                return self.load_cdx(**params)
+            fuzzy_params = self.fuzzy_query(params)
+            if fuzzy_params:
+                return self.load_cdx(**fuzzy_params)

        msg = 'No Captures found for: ' + url
        raise CaptureNotFoundException(msg)
@ -98,7 +98,6 @@ class CDXServer(BaseCDXServer):
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

-            #params['key'] = self.url_canon(url)
            match_type = params.get('matchType', 'exact')

            key, end_key = calc_search_range(url=url,
@ -159,7 +158,7 @@ class CDXServer(BaseCDXServer):
        if filename.endswith('.cdx'):
            return CDXFile(filename)

-        if filename.endswith('.summary'):
+        if filename.endswith(('.summary', '.idx')):
            return ZipNumCluster(filename, config)

        logging.warn('skipping unrecognized URI:%s', filename)
@ -218,7 +217,7 @@ def create_cdx_server(config, ds_rules_file=None):
    return server_cls(paths,
                      config=pass_config,
                      surt_ordered=surt_ordered,
-                      ds_rules=ds_rules_file,
+                      ds_rules_file=ds_rules_file,
                      perms_checker=perms_checker)

 #=================================================================
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -1,6 +1,8 @@
 from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader

+from cdxobject import AccessException
+
 import urllib
 import urllib2
 import itertools
@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
-        
+

    def load_cdx(self, params):
        """
--- a/pywb/cdx/perms.py
+++ b/pywb/cdx/perms.py
@ -1,7 +1,7 @@


 #=================================================================
-class AllowAllPerms:
+class AllowAllPerms(object):
    """
    Sample Perm Checker which allows all
    """
--- a/pywb/cdx/rules.yaml
+++ b/pywb/cdx/rules.yaml
@ -1,24 +0,0 @@
-
-fuzzy_lookup_rules:
-    - startswith: 'com,twitter)/i/profiles/show/'
-      matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
-
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
-
-    - startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
-      matches: '([^/]+(?:\.css|\.js))'
-
-    # matches all urls
-    - startswith: ''
-      matches: '[&?](?:_|uncache)=[\d]+[&]?'
-
-canon_rules:
-    - startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
-      replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
-
-
-
-
-
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

+# Filter contains
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+
+# Filter contains invert
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
+com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
+com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
+
 # Collapse by timestamp
 # unresolved revisits, different statuscode results in an extra repeat
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
@ -131,9 +141,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('offset', '334'),
 ('filename', 'dupes.warc.gz')]

-# NOTE: external dependency -- need self-contained test
-#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
-#>>> pprint.pprint(x.next().items())
+# NOTE: external dependency -- need self-contained test TODO
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
+>>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
 ('timestamp', '20020120142510'),
 ('original', 'http://example.com:80/'),
@ -142,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]

+
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+Traceback (most recent call last):
+AccessException: Blocked By Robots
 """

 #=================================================================
@ -169,7 +183,8 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    results = server.load_cdx(**kwparams)

    for x in results:
-        sys.stdout.write(x.to_text(fields))
+        l = x.to_text(fields).replace('\t', '    ')
+        sys.stdout.write(l)

 #================================================================

--- a/pywb/cdx/test/zipnum_test.py
+++ b/pywb/cdx/test/zipnum_test.py
@ -0,0 +1,44 @@
+"""
+>>> zip_ops_test(url = 'http://iana.org')
+org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
+org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
+org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
+
+# test idx index (tabs replaced with 4 spaces)
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
+org,iana)/dnssec 20140126201307    zipnum    8511    373
+org,iana)/domains/int 20140126201239    zipnum    8884    353
+org,iana)/domains/root/servers 20140126201227    zipnum    9237    386
+
+>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
+org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
+org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
+org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
+org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
+org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
+org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
+org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
+
+"""
+
+
+
+
+from cdxserver_test import cdx_ops_test
+
+from pywb import get_test_dir
+test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
+print test_zipnum
+
+def zip_ops_test(url, **kwargs):
+    sources = test_zipnum
+    cdx_ops_test(url, sources, **kwargs)
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+
+
--- a/pywb/config_utils.py
+++ b/pywb/config_utils.py
@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
    return file

 #=================================================================
-def create_wb_handler(cdx_server, config):
+def create_wb_handler(cdx_server, config, ds_rules_file=None):

    record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
    paths = config.get('archive_paths')

-    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
+    resolving_loader = ResolvingLoader(paths=paths,
+                                       cdx_server=cdx_server,
+                                       record_loader=record_loader)

    replayer = replay_views.ReplayView(
        content_loader = resolving_loader,

-        content_rewriter = RewriteContent(),
+        content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),

        head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),

--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):

        route_config = DictChain(value, config)

-        ds_rules = route_config.get('domain_specific_rules', None)
-        cdx_server = IndexReader(route_config, ds_rules)
+        ds_rules_file = route_config.get('domain_specific_rules', None)
+        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = config_utils.create_wb_handler(
-            cdx_server = cdx_server,
-            config = route_config,
+            cdx_server=cdx_server,
+            config=route_config,
+            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: ' + name)
--- a/pywb/replay_views.py
+++ b/pywb/replay_views.py
@ -7,6 +7,8 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed

+from pywb.utils.loaders import LimitReader
+
 #=================================================================
 class ReplayView:
    def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -53,10 +55,21 @@ class ReplayView:

                response = None

+                # if Content-Length for payload is present, ensure we don't read past it
+                content_len = status_headers.get_header('content-length')
+                try:
+                    content_len=int(content_len)
+                    if content_len > 0:
+                        stream = LimitReader(stream, content_len)
+                except ValueError:
+                    pass
+
                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
                else:
                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
+                    #status_headers.remove_header('content-length')
+
                    response_iter = self.stream_to_iter(stream)
                    response = WbResponse(status_headers, response_iter)

@ -99,20 +112,34 @@ class ReplayView:
    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        urlrewriter = wbrequest.urlrewriter

-        (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
+        result = self.content_rewriter.rewrite_headers(urlrewriter,
+                                                       status_headers,
+                                                       stream,
+                                                       cdx['urlkey'])
+        (rewritten_headers, stream) = result

        # no rewriting needed!
        if rewritten_headers.text_type is None:
            response_iter = self.stream_to_iter(stream)
            return WbResponse(rewritten_headers.status_headers, response_iter)

-        # do head insert
+        def make_head_insert(rule):
+            return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
+                                                           cdx=cdx,
+                                                           rule=rule))
+         # do head insert
        if self.head_insert_view:
-            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
+            head_insert_func = make_head_insert
        else:
-            head_insert_str = None
+            head_insert_func = None

-        (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
+        result = self.content_rewriter.rewrite_content(urlrewriter,
+                                                       rewritten_headers,
+                                                       stream,
+                                                       head_insert_func,
+                                                       cdx['urlkey'])
+
+        (status_headers, response_gen) = result

        if self.buffer_response:
            if wbrequest.wb_url.mod == 'id_':
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@ -4,11 +4,16 @@ import itertools

 from url_rewriter import UrlRewriter

+
 #=================================================================
 class RegexRewriter(object):
+    #@staticmethod
+    #def comment_out(string):
+    #    return '/*' + string + '*/'
+
    @staticmethod
-    def comment_out(string):
-        return '/*' + string + '*/'
+    def format(template):
+        return lambda string: template.format(string)

    @staticmethod
    def remove_https(string):
@ -20,19 +25,16 @@ class RegexRewriter(object):

    @staticmethod
    def archival_rewrite(rewriter):
-        return lambda x: rewriter.rewrite(x)
+        return lambda string: rewriter.rewrite(string)

-    @staticmethod
-    def replacer(string):
-        return lambda x: string
+    #@staticmethod
+    #def replacer(other):
+    #    return lambda m, string: other

    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'

-
-
    DEFAULT_OP = add_prefix

-
    def __init__(self, rules):
        #rules = self.create_rules(http_prefix)

@ -76,52 +78,68 @@ class RegexRewriter(object):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))
+            final_str = result

            # if extracting partial match
            if i != full_m:
-                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
+                final_str = m.string[m.start(full_m):m.start(i)]
+                final_str += result
+                final_str += m.string[m.end(i):m.end(full_m)]

+            return final_str
+
+    @staticmethod
+    def parse_rules_from_config(config):
+        def parse_rule(obj):
+            match = obj.get('match')
+            replace = RegexRewriter.format(obj.get('replace', '{0}'))
+            group = obj.get('group', 0)
+            result = (match, replace, group)
            return result
-
+        return map(parse_rule, config)


 #=================================================================
-class JSLinkRewriter(RegexRewriter):
+class JSLinkOnlyRewriter(RegexRewriter):
    """
    JS Rewriter which rewrites absolute http://, https:// and // urls
    at the beginning of a string
    """
    JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'

-    def __init__(self, rewriter, rules = []):
+    def __init__(self, rewriter, rules=[]):
        rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
-        super(JSLinkRewriter, self).__init__(rules)
+        super(JSLinkOnlyRewriter, self).__init__(rules)
+

 #=================================================================
-class JSLocationAndLinkRewriter(JSLinkRewriter):
+class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
    """
    JS Rewriter which also rewrites location and domain to the
    specified prefix (default: 'WB_wombat_')
    """

-    def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
+    def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
        rules = rules + [
             (r'(?<!/)\blocation\b', prefix, 0),
             (r'(?<=document\.)domain', prefix, 0),
        ]
-        super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
+        #import sys
+        #sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
+        super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
+

 #=================================================================
 # Set 'default' JSRewriter
-JSRewriter = JSLocationAndLinkRewriter
+JSRewriter = JSLinkAndLocationRewriter


 #=================================================================
 class XMLRewriter(RegexRewriter):
-    def __init__(self, rewriter, extra = []):
+    def __init__(self, rewriter, extra=[]):
        rules = self._create_rules(rewriter.get_abs_url())

-        RegexRewriter.__init__(self, rules)
+        super(XMLRewriter, self).__init__(rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
@ -133,24 +151,28 @@ class XMLRewriter(RegexRewriter):

    def _create_rules(self, http_prefix):
        return [
-             ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
+             ('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
+              RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]

+
 #=================================================================
 class CSSRewriter(RegexRewriter):
+
    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
-    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
+
+    CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
+                               "(?!url[\\s\\(])([\w.:/\\\\-]+)")

    def __init__(self, rewriter):
        rules = self._create_rules(rewriter)
-
-        RegexRewriter.__init__(self, rules)
-
+        super(CSSRewriter, self).__init__(rules)

    def _create_rules(self, rewriter):
        return [
-             (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
-             (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
+             (CSSRewriter.CSS_URL_REGEX,
+              RegexRewriter.archival_rewrite(rewriter), 1),
+
+             (CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
+              RegexRewriter.archival_rewrite(rewriter), 1),
        ]
-
-
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@ -1,30 +1,27 @@
 import chardet
+import pkgutil
+import yaml

-from url_rewriter import UrlRewriter
-from html_rewriter import HTMLRewriter
-from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
-from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
+from header_rewriter import RewrittenStatusAndHeaders

+from rewriterules import RewriteRules
+
+from pywb.utils.dsrules import RuleSet
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader

+
+#=================================================================
 class RewriteContent:
+    def __init__(self, ds_rules_file=None):
+        self.ruleset = RuleSet(RewriteRules, 'rewrite',
+                               default_rule_config={},
+                               ds_rules_file=ds_rules_file)

-    DEFAULT_CONTENT_REWRITERS = {
-      'header': HeaderRewriter,
-      'js': JSRewriter,
-      'css': CSSRewriter,
-      'xml': XMLRewriter,
-      'html': HTMLRewriter
-    }
+    def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
+        header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']

-
-    def __init__(self, rewriters = {}):
-        self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
-
-
-    def rewrite_headers(self, urlrewriter, status_headers, stream):
-        rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
+        rewritten_headers = header_rewriter_class().rewrite(status_headers, urlrewriter)

        # note: since chunking may be broken, approach taken here is to *always* attempt
        # to dechunk if transfer-encoding: chunked is present
@ -37,7 +34,8 @@ class RewriteContent:

        return (rewritten_headers, stream)

-    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
+    def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
+
        # see if we've already rewritten headers
        if isinstance(headers, RewrittenStatusAndHeaders):
            rewritten_headers = headers
@ -50,9 +48,11 @@ class RewriteContent:
                return (status_headers, gen)

        status_headers = rewritten_headers.status_headers
+
        # Handle text content rewriting
        # =========================================================================
        # special case -- need to ungzip the body
+
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
            stream = DecompressingBufferedReader(stream, decomp_type='gzip')

@ -68,13 +68,27 @@ class RewriteContent:

        text_type = rewritten_headers.text_type

-        rewriter_class = self.rewriters.get(text_type)
-        if not rewriter_class:
+        rule = self.ruleset.get_first_match(urlkey)
+
+        try:
+            rewriter_class = rule.rewriters[text_type]
+        except KeyError:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

+        #import sys
+        #sys.stderr.write(str(vars(rule)))

        if text_type == 'html':
-            rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
+            head_insert_str = ''
+
+            if head_insert_func:
+                head_insert_str = head_insert_func(rule)
+
+            rewriter = rewriter_class(urlrewriter,
+                                      outstream=None,
+                                      js_rewriter_class=rule.rewriters['js'],
+                                      css_rewriter_class=rule.rewriters['css'],
+                                      head_insert=head_insert_str)
        else:
            rewriter = rewriter_class(urlrewriter)

--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@ -2,12 +2,17 @@ import urllib2
 import os
 import sys
 import datetime
+import mimetypes

+from pywb.utils.loaders import is_http
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.canonicalize import canonicalize
+
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.rewrite_content import RewriteContent

+
 """
 Fetch a url from live web and apply rewriting rules
 """
@ -26,10 +31,37 @@ def get_status_and_stream(url):
    return (status_headers, stream)

 #=================================================================
-def get_rewritten(url, urlrewriter):
-    (status_headers, stream) = get_status_and_stream(url)
+def get_local_file(uri):
+    """
+    Open a local file and wrap it like a fetched response.
+
+    :param uri: path of the local file to open
+    :return: (status_headers, stream) tuple, where status_headers is a
+        synthetic '200 OK' with a guessed Content-Type and stream is the
+        open file object (left open; closing it is up to the consumer)
+    """
+    fh = open(uri)

-    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
+    # NOTE(review): guess_type() returns None as the type when it cannot
+    # guess one -- confirm a None Content-Type is acceptable downstream
+    content_type, _ = mimetypes.guess_type(uri)
+
+    # create fake headers for local file
+    status_headers = StatusAndHeaders('200 OK', [('Content-Type', content_type)])
+    stream = fh
+
+    return (status_headers, stream)
+
+#=================================================================
+def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
+    if is_http(url):
+        (status_headers, stream) = get_status_and_stream(url)
+    else:
+        (status_headers, stream) = get_local_file(url)
+
+    # explicit urlkey may be passed in (say for testing)
+    if not urlkey:
+        urlkey = canonicalize(url)
+
+    rewriter = RewriteContent()
+
+    result = rewriter.rewrite_content(urlrewriter,
+                                      status_headers,
+                                      stream,
+                                      head_insert_func=head_insert_func,
+                                      urlkey=urlkey)
+
+    status_headers, gen = result

    buff = ''
    for x in gen:
--- a/pywb/rewrite/rewriterules.py
+++ b/pywb/rewrite/rewriterules.py
@ -0,0 +1,53 @@
+from pywb.utils.dsrules import BaseRule
+
+from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
+from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
+from html_rewriter import HTMLRewriter
+from header_rewriter import HeaderRewriter
+
+import itertools
+
+class RewriteRules(BaseRule):
+    """
+    Rewriting rule for a url prefix: maps each content type
+    ('header', 'css', 'xml', 'html', 'js') to the rewriter to use for it.
+    """
+    def __init__(self, url_prefix, config={}):
+        """
+        :param url_prefix: url key prefix (or list of prefixes) passed to BaseRule
+        :param config: settings for this rule; only read here, never modified.
+            Keys read: 'header_class', 'css_class', 'xml_class', 'html_class',
+            'js_class', 'js_rewrite_location' and 'js_regexs'.
+        """
+        super(RewriteRules, self).__init__(url_prefix, config)
+
+        # content type -> rewriter class (or factory, see _add_custom_regexs)
+        self.rewriters = {}
+
+        #self._script_head_inserts = config.get('script_head_inserts', {})
+
+        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
+        self.rewriters['css'] = config.get('css_class', CSSRewriter)
+        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
+        self.rewriters['html'] = config.get('html_class', HTMLRewriter)
+
+        # Custom handling for js rewriting, often the most complex
+        # (location rewriting is on unless explicitly disabled in config)
+        self.js_rewrite_location = config.get('js_rewrite_location', True)
+        self.js_rewrite_location = bool(self.js_rewrite_location)
+
+        # ability to toggle rewriting
+        if self.js_rewrite_location:
+            js_default_class = JSLinkAndLocationRewriter
+        else:
+            js_default_class = JSLinkOnlyRewriter
+
+        # set js class, using either default or override from config
+        self.rewriters['js'] = config.get('js_class', js_default_class)
+
+        # add any regexs for js rewriter
+        self._add_custom_regexs('js', config)
+
+    def _add_custom_regexs(self, field, config):
+        """
+        If config has a '<field>_regexs' entry, replace the rewriter for
+        `field` with a factory which builds the previously selected rewriter
+        with the extra rules parsed from that entry. Otherwise do nothing.
+        """
+        regexs = config.get(field + '_regexs')
+        if not regexs:
+            return
+
+        # capture the current class before it is replaced below
+        rewriter_cls = self.rewriters[field]
+
+        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)
+
+        # takes the same single urlrewriter argument as the plain class call
+        def extend_rewriter_with_regex(urlrewriter):
+            #import sys
+            #sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples))
+            return rewriter_cls(urlrewriter, rule_def_tuples)
+
+        self.rewriters[field] = extend_rewriter_with_regex
--- a/pywb/rewrite/test/test_rewrite.py
+++ b/pywb/rewrite/test/test_rewrite.py
@ -121,7 +121,7 @@ r"""
 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

 # custom rules added
->>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
+>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

 # scheme-agnostic
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@ -1,11 +1,50 @@
 from pywb.rewrite.rewrite_live import get_rewritten
 from pywb.rewrite.url_rewriter import UrlRewriter

+from pywb import get_test_dir
+
 # This module has some rewriting tests against the 'live web'
 # As such, the content may change and the test may break

 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')

+def head_insert_func(rule):
+    """
+    Return the html to insert into <head> for the matched rewrite rule:
+    the wombat.js script tag if the rule rewrites js location, else ''.
+    """
+    # truthiness test instead of comparing against True
+    if rule.js_rewrite_location:
+        return '<script src="/static/default/wombat.js"> </script>'
+
+    return ''
+
+
+def test_local_1():
+    """
+    Rewrite the local sample.html with an explicit urlkey and check that
+    the wombat insert, js location rewrite and link rewrite are all applied.
+    """
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'com,example,test)/',
+                                         head_insert_func)
+
+    # wombat insert added
+    assert '<head><script src="/static/default/wombat.js"> </script>' in buff
+
+    # location rewritten
+    assert 'window.WB_wombat_location = "/other.html"' in buff
+
+    # link rewritten
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+
+def test_local_2_no_js_location_rewrite():
+    """
+    Rewrite the local sample.html under the test-only urlkey prefix whose
+    rule sets js_rewrite_location to False: no wombat insert and no
+    location rewrite are expected, but links are still rewritten.
+    """
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         urlrewriter,
+                                         'example,example,test)/nolocation_rewrite',
+                                         head_insert_func)
+
+    # no wombat insert
+    assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
+
+    # no location rewrite
+    assert 'window.location = "/other.html"' in buff
+
+    # still link rewrite
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff

 def test_example_1():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
@ -24,9 +63,10 @@ def test_example_2():



-#def test_example_3():
-#    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
+def test_example_domain_specific_3():
+    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)

-#    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
+    # comment out bootloader
+    assert '/* Bootloader.configurePage' in buff


--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -0,0 +1,50 @@
+
+rules:
+
+    # twitter rules
+    #=================================================================
+    - url_prefix: 'com,twitter)/i/profiles/show/'
+
+      fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
+
+
+    # facebook rules
+    #=================================================================
+    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
+
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
+
+# not actually needed, fuzzy match is used instead here
+#      canonicalize:
+#        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+#        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+
+
+    - url_prefix: 'com,facebook)/'
+      rewrite:
+        js_regexs:
+            - match: 'Bootloader\.configurePage.*'
+              replace: '/* {0} */'
+
+
+    # yahoo rules
+    #=================================================================
+    - url_prefix: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
+
+      fuzzy_lookup: '([^/]+(?:\.css|\.js))'
+
+
+    # testing rules -- not for valid domain
+    #=================================================================
+    # this rule block is a non-existent prefix merely for testing
+    - url_prefix: 'example,example,test)/nolocation_rewrite'
+
+      rewrite:
+        js_rewrite_location: False
+
+
+    # all domain rules -- fallback to this dataset
+    #=================================================================
+    # Applies to all urls -- should be last
+    - url_prefix: ''
+      fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@ -1,18 +1,21 @@
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.

+This file is part of pywb.

-// Rewritten location and domain obj setup
-window.WB_wombat_location = window.location
+    pywb is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-if (window.top != window) {
-    window.top.WB_wombat_location = window.top.location
-}
-
-if (window.opener) {
-    window.opener.WB_wombat_location = window.opener.location
-}
-
-document.WB_wombat_domain = document.domain
+    pywb is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

+    You should have received a copy of the GNU General Public License
+    along with pywb.  If not, see <http://www.gnu.org/licenses/>.
+*/

 function initBanner()
 {
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@ -0,0 +1,219 @@
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
+
+This file is part of pywb.
+
+    pywb is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    pywb is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with pywb.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//============================================
+// Wombat JS-Rewriting Library
+//============================================
+
+var WB_wombat_replayPrefix;
+var WB_wombat_replayDatePrefix;
+var WB_wombat_captureDatePart;
+var WB_wombat_origHost;
+
+	
+// Remove the ":port" part from a url that starts with http://host:port
+// Any other string (including https:// urls) is returned unchanged
+function WB_StripPort(str)
+{
+  var hostWithPort = str.match(/^http:\/\/[\w\d@.-]+:\d+/);
+  if (hostWithPort) {
+     // keep everything before the last ':' of the match, then the rest of the url
+     var hostName = hostWithPort[0].substr(0, hostWithPort[0].lastIndexOf(':'));
+     return hostName + str.substr(hostWithPort[0].length);
+  }
+
+  return str;
+}
+
+// Heuristic: does this scheme-less string look like it starts with a host?
+// Returns true for "www." prefixes, hostname:port and ip[:port] forms
+function WB_IsHostUrl(str)
+{
+  // Good guess that it's a hostname
+  if (str.indexOf("www.") == 0) {
+    return true;
+  }
+  
+  // hostname:port (port required)
+  // the matched part must also be shorter than 64 chars
+  var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/);
+  if (matches && (matches[0].length < 64)) {
+    return true;
+  }
+  
+  // ip:port (port optional here)
+  matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/);
+  if (matches && (matches[0].length < 64)) {
+    return true;
+  }
+
+  return false;
+}
+
+// Rewrite a url so that it points back into the replay
+// Non-string or empty values, already prefixed urls and anything
+// not recognized as a url are returned unchanged
+// Reads the WB_wombat_* globals set by WB_wombat_Init()
+function WB_RewriteUrl(url)
+{
+  var httpPrefix = "http://";
+
+  // If not dealing with a string, just return it
+  if (!url || (typeof url) != "string") {
+    return url;
+  }
+  
+  // If starts with prefix, no rewriting needed
+  // Only check replay prefix (no date) as date may be different for each capture
+  if (url.indexOf(WB_wombat_replayPrefix) == 0) {
+    return url;
+  }
+  
+  // If server relative url, add prefix and original host
+  if (url.charAt(0) == "/") {
+    
+    // Already contains the capture date part (presumably already
+    // rewritten), don't make any changes!
+    if (url.indexOf(WB_wombat_captureDatePart) >= 0) {
+      return url;
+    }
+    
+    return WB_wombat_replayDatePrefix + WB_wombat_origHost + url;
+  }
+  
+  // If full url starting with http://, add prefix
+  if (url.indexOf(httpPrefix) == 0) {
+    return WB_wombat_replayDatePrefix + url;
+  }
+  
+  // May or may not be a hostname, call function to determine
+  // If it is, add the prefix and http://
+  // NOTE(review): the port is not removed here, WB_StripPort() is not called
+  if (WB_IsHostUrl(url)) {
+    return WB_wombat_replayDatePrefix + httpPrefix + url;
+  }
+
+  return url;
+}
+
+// Return a new plain object holding every enumerable property of obj
+// whose value is not a function
+function WB_CopyObjectFields(obj)
+{
+  var newObj = {};
+
+  // declare the loop variable so that it does not become an implicit global
+  for (var prop in obj) {
+    if ((typeof obj[prop]) != "function") {
+      newObj[prop] = obj[prop];
+    }
+  }
+
+  return newObj;
+}
+
+// Return the original url embedded in a replay href: everything from the
+// first "/http" found after position 0 (without the leading slash)
+// A falsy href gives "", an href without an embedded url is returned as a string
+function WB_ExtractOrig(href)
+{
+  if (!href) {
+    return "";
+  }
+
+  var hrefStr = href.toString();
+  var origStart = hrefStr.indexOf("/http", 1);
+
+  return (origStart > 0) ? hrefStr.substr(origStart + 1) : hrefStr;
+}
+  
+// Build a stand-in for a location object: a copy of its non-function
+// fields whose href is the original (un-prefixed) url
+function WB_CopyLocationObj(loc)
+{
+  var newLoc = WB_CopyObjectFields(loc);
+  
+  // keep the real location and its href at copy time, used later
+  // to detect and apply changes
+  newLoc._origLoc = loc;
+  newLoc._origHref = loc.href;
+  
+  // Rewrite replace and assign functions
+  newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); }
+  newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); }
+  newLoc.reload = loc.reload;
+  newLoc.href = WB_ExtractOrig(newLoc._origHref);
+  newLoc.toString = function() { return this.href; }
+  
+  return newLoc;
+}
+
+// If reqHref is set and refers to a different original url than origHref,
+// navigate the given location to the rewritten form of reqHref
+function WB_wombat_updateLoc(reqHref, origHref, location)
+{
+  // compare the original urls, so that a differing replay prefix alone is not a change
+  if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) {      
+    var finalHref = WB_RewriteUrl(reqHref);
+    
+    location.href = finalHref;
+  }  
+}
+
+// Apply any change that page script made to a WB_wombat_location value
+// wbLoc: the current WB_wombat_location (string or copied location object)
+// isTop: true to act on window.top.location, false for window.location
+function WB_wombat_checkLocationChange(wbLoc, isTop)
+{
+  var locType = (typeof wbLoc);
+  
+  var location = (isTop ? window.top.location : window.location);
+	
+  // String has been assigned to location, so assign it
+  if (locType == "string") {
+    WB_wombat_updateLoc(wbLoc, location.href, location)
+    
+  // Still an object: compare its href to the href saved when it was copied
+  } else if (locType == "object") {
+    WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location);
+  }
+}
+
+// true while WB_wombat_checkLocations() is running
+var wombat_updating = false;
+
+// Check the WB_wombat_location of this window (and of the top window,
+// if different) for changes made by page script
+// Returns false if a check is already in progress, undefined otherwise
+function WB_wombat_checkLocations()
+{
+  if (wombat_updating) {
+    return false;
+  }
+
+  wombat_updating = true;
+
+  // always clear the flag, even if a check throws, otherwise
+  // every later call would be skipped
+  try {
+    WB_wombat_checkLocationChange(window.WB_wombat_location, false);
+
+    if (window.self.location != window.top.location) {
+      WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
+    }
+  } finally {
+    wombat_updating = false;
+  }
+}
+
+// Set up wombat for a replayed page
+// replayPrefix: replay url prefix, placed before the capture date
+// captureDate:  capture timestamp string of the replayed page
+// origHost:     host of the original url, without scheme
+function WB_wombat_Init(replayPrefix, captureDate, origHost)
+{
+  WB_wombat_replayPrefix = replayPrefix;
+  WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/";
+  WB_wombat_captureDatePart = "/" + captureDate + "/";
+
+  WB_wombat_origHost = "http://" + origHost;
+
+  window.WB_wombat_location = WB_CopyLocationObj(window.self.location);
+
+  // also provide the location copy on the top window when inside a frame
+  if (window.self.location != window.top.location) {
+    window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location);
+  }
+
+  // window.opener is known to be set here, no further check needed
+  if (window.opener) {
+    window.opener.WB_wombat_location = WB_CopyLocationObj(window.opener.location);
+  }
+
+  document.WB_wombat_domain = origHost;
+}
+
+// Check quickly after page load
+setTimeout(WB_wombat_checkLocations, 100);
+
+
+// Check periodically, every 500 ms
+setInterval(WB_wombat_checkLocations, 500);
--- a/pywb/test/test_archivalrouter.py
+++ b/pywb/test/test_archivalrouter.py
@ -15,6 +15,13 @@
 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}

+# route with no collection
+>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
+{'coll': '',
+ 'request_uri': 'http://example.com',
+ 'wb_prefix': '/pywb/',
+ 'wb_url': None}
+
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)

@ -67,6 +74,13 @@ False
 >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
 False

+# With no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
+'http://localhost:8080/2013/http://example.com/other.html'
+
+# With SCRIPT_NAME but no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
+'http://localhost:8080/pywb-access/http://example.com/other.html'

 """

--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@ -1,7 +1,14 @@
 <!-- WB Insert -->
+{% if rule.js_rewrite_location %}
+<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
 <script>
-wbinfo = {}
-wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
+  WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
+</script>
+{% endif %}
+
+<script>
+  wbinfo = {}
+  wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
 </script>
 <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
 <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
--- a/pywb/utils/canonicalize.py
+++ b/pywb/utils/canonicalize.py
@ -3,8 +3,6 @@

 import surt
 import urlparse
-from cdxobject import CDXException
-

 #=================================================================
 class UrlCanonicalizer(object):
@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
        return canonicalize(url, self.surt_ordered)


+#=================================================================
+class UrlCanonicalizeException(Exception):
+    """
+    Raised when a url can not be canonicalized or when an unsupported
+    match type is requested.
+    """
+    def status(self):
+        """Return the http status line to report for this error."""
+        return '400 Bad Request'
+
+
 #=================================================================
 def canonicalize(url, surt_ordered=True):
    """
@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
    try:
        key = surt.surt(url)
    except Exception as e:
-        raise CDXException('Invalid Url: ' + url)
+        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
@ -114,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

-    # domain range not supported
+    # errors: domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
-    Exception: matchType=domain unsupported for non-surt
+    UrlCanonicalizeException: matchType=domain unsupported for non-surt
+
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    Traceback (most recent call last):
+    UrlCanonicalizeException: Invalid match_type: blah
+
    """
    def inc_last_char(x):
        return x[0:-1] + chr(ord(x[-1]) + 1)
@ -155,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

    elif match_type == 'domain':
        if not surt_ordered:
-            raise Exception('matchType=domain unsupported for non-surt')
+            raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

@ -168,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

        end_key = host + '-'
    else:
-        raise Exception('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)

    return (start_key, end_key)

--- a/pywb/utils/dsrules.py
+++ b/pywb/utils/dsrules.py
@ -0,0 +1,98 @@
+import yaml
+import pkgutil
+
+#=================================================================
+
+DEFAULT_RULES_FILE = 'rules.yaml'
+DEFAULT_RULES_PKG = 'pywb'
+
+
+#=================================================================
+class RuleSet(object):
+    """
+    Ordered list of domain specific rules of one type, loaded from
+    the rules file and matched against url keys by prefix.
+    """
+    DEFAULT_KEY = ''
+
+    def __init__(self, rule_cls, fieldname, **kwargs):
+        """
+        A domain specific rules block, loaded from the rules file.
+
+        Each entry of the 'rules' list which has a `fieldname` block is
+        turned into a rule_cls(url_prefix, block) instance, in file order.
+        Entries without such a block are skipped.
+
+        :param rule_cls: rule class, called as rule_cls(url_prefix, config)
+        :param fieldname: key of the per-entry block this ruleset reads
+        :param ds_rules_file: (keyword) rules file name, default if not set
+        :param default_rule_config: (keyword) if not None, a rule for
+            DEFAULT_KEY with this config is added unless the file
+            already defines one
+        """
+
+        self.rules = []
+
+        ds_rules_file = kwargs.get('ds_rules_file')
+        default_rule_config = kwargs.get('default_rule_config')
+
+        config = self.load_default_rules(ds_rules_file)
+
+        rulesmap = config.get('rules') if config else None
+
+        # no rules loaded: nothing to iterate over, so stop here
+        # (previously this iterated over None if no default config was given)
+        if not rulesmap:
+            # if default_rule_config provided, always init a default ruleset
+            if default_rule_config is not None:
+                self.rules = [rule_cls(self.DEFAULT_KEY, default_rule_config)]
+            return
+
+        def_key_found = False
+
+        # iterate over master rules file
+        for value in rulesmap:
+            url_prefix = value.get('url_prefix')
+            rules_def = value.get(fieldname)
+            if not rules_def:
+                continue
+
+            if url_prefix == self.DEFAULT_KEY:
+                def_key_found = True
+
+            self.rules.append(rule_cls(url_prefix, rules_def))
+
+        # if default_rule_config provided, always init a default ruleset
+        if not def_key_found and default_rule_config is not None:
+            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
+
+    @staticmethod
+    def load_default_rules(filename=None, pkg=None):
+        """
+        Load and parse the yaml rules file from package data.
+
+        :param filename: rules file name, DEFAULT_RULES_FILE if not set
+        :param pkg: package holding the file, DEFAULT_RULES_PKG if not set
+        :return: the parsed yaml content
+        """
+        if not filename:
+            filename = DEFAULT_RULES_FILE
+
+        if not pkg:
+            pkg = DEFAULT_RULES_PKG
+
+        yaml_str = pkgutil.get_data(pkg, filename)
+
+        # NOTE(review): yaml.load() can construct arbitrary objects,
+        # consider yaml.safe_load() if the rules file may be untrusted
+        return yaml.load(yaml_str)
+
+    def iter_matching(self, urlkey):
+        """
+        Iterate over all matching rules for given urlkey
+        """
+        for rule in self.rules:
+            if rule.applies(urlkey):
+                yield rule
+
+    def get_first_match(self, urlkey):
+        """
+        Return the first rule which applies to the given urlkey,
+        or None if no rule applies
+        """
+        return next(self.iter_matching(urlkey), None)
+
+
+#=================================================================
+class BaseRule(object):
+    """
+    Common base for rules tied to one or more url key prefixes;
+    subclasses add the handling specific to their rule type
+    """
+    def __init__(self, url_prefix, rules):
+        """
+        :param url_prefix: a single prefix or a list of prefixes
+        :param rules: rule settings, not used by the base class
+        """
+        # always store the prefixes as a list
+        if isinstance(url_prefix, list):
+            self.url_prefix = url_prefix
+        else:
+            self.url_prefix = [url_prefix]
+
+    def applies(self, urlkey):
+        """Return True if urlkey starts with any of this rule's prefixes"""
+        for prefix in self.url_prefix:
+            if urlkey.startswith(prefix):
+                return True
+
+        return False
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@ -9,6 +9,7 @@ import urllib2
 import time


+#=================================================================
 def is_http(filename):
    return any(filename.startswith(x) for x in ['http://', 'https://'])

--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@ -162,6 +162,10 @@ def timestamp_to_datetime(string):
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)

+    # not a number!
+    >>> timestamp_to_datetime('2010abc')
+    datetime.datetime(2010, 12, 31, 23, 59, 59)
+
    """

    # pad to 6 digits
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
 from wbrequestresponse import WbResponse, StatusAndHeaders

 from pywb.cdx.cdxserver import CDXException
+from pywb.utils.canonicalize import UrlCanonicalizeException
 from pywb.warc.recordloader import ArchiveLoadFailed

 import os
@ -55,7 +56,8 @@ def create_wb_app(wb_router):
        except InternalRedirect as ir:
            response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))

-        except (WbException, CDXException, ArchiveLoadFailed) as e:
+        except (WbException, CDXException,
+                UrlCanonicalizeException, ArchiveLoadFailed) as e:
            response = handle_exception(env, wb_router.error_view, e, False)

        except Exception as e:
--- a/sample_archive/text_content/sample.html
+++ b/sample_archive/text_content/sample.html
@ -0,0 +1,14 @@
+<html>
+<head>
+<title>Sample Page For Rewrite Test</title>
+</head>
+<body>
+<script>
+var some_val = false;
+if (some_val) {
+    window.location = "/other.html";
+}
+</script>
+Test Content
+<a href="another.html">Some Link</a>
+</body>
--- a/sample_archive/zipcdx/zipnum-sample.cdx.gz
+++ b/sample_archive/zipcdx/zipnum-sample.cdx.gz
--- a/sample_archive/zipcdx/zipnum-sample.idx
+++ b/sample_archive/zipcdx/zipnum-sample.idx
@ -0,0 +1,38 @@
+com,example)/ 20140127171200	zipnum	0	276
+org,iana)/ 20140127171238	zipnum	276	328
+org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055	zipnum	604	312
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718	zipnum	916	235
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912	zipnum	1151	235
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240	zipnum	1386	306
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654	zipnum	1692	235
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816	zipnum	1927	231
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128	zipnum	2158	236
+org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240	zipnum	2394	312
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805	zipnum	2706	234
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055	zipnum	2940	235
+org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308	zipnum	3175	289
+org,iana)/_css/2013.1/print.css 20140126200737	zipnum	3464	208
+org,iana)/_css/2013.1/print.css 20140126200929	zipnum	3672	207
+org,iana)/_css/2013.1/print.css 20140126201248	zipnum	3879	276
+org,iana)/_css/2013.1/screen.css 20140126200706	zipnum	4155	210
+org,iana)/_css/2013.1/screen.css 20140126200825	zipnum	4365	211
+org,iana)/_css/2013.1/screen.css 20140126201227	zipnum	4576	216
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654	zipnum	4792	236
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816	zipnum	5028	219
+org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128	zipnum	5247	221
+org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625	zipnum	5468	299
+org,iana)/_img/2013.1/icann-logo.svg 20140126200719	zipnum	5767	210
+org,iana)/_img/2013.1/icann-logo.svg 20140126200912	zipnum	5977	212
+org,iana)/_img/2013.1/icann-logo.svg 20140126201240	zipnum	6189	281
+org,iana)/_img/bookmark_icon.ico 20140126200631	zipnum	6470	298
+org,iana)/_js/2013.1/iana.js 20140126200716	zipnum	6768	213
+org,iana)/_js/2013.1/iana.js 20140126200912	zipnum	6981	216
+org,iana)/_js/2013.1/iana.js 20140126201239	zipnum	7197	270
+org,iana)/_js/2013.1/jquery.js 20140126200653	zipnum	7467	215
+org,iana)/_js/2013.1/jquery.js 20140126200816	zipnum	7682	209
+org,iana)/_js/2013.1/jquery.js 20140126201127	zipnum	7891	210
+org,iana)/_js/2013.1/jquery.js 20140127171239	zipnum	8101	410
+org,iana)/dnssec 20140126201307	zipnum	8511	373
+org,iana)/domains/int 20140126201239	zipnum	8884	353
+org,iana)/domains/root/servers 20140126201227	zipnum	9237	386
+org,iana)/time-zones 20140126200737	zipnum	9623	145
--- a/sample_archive/zipcdx/zipnum-sample.loc
+++ b/sample_archive/zipcdx/zipnum-sample.loc
@ -0,0 +1 @@
+zipnum	./sample_archive/zipcdx/zipnum-sample.cdx.gz
--- a/setup.py
+++ b/setup.py
@ -22,7 +22,9 @@ setup(
        },
    data_files = [
        ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
-        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))
+        ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
+        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
+        ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))
        ],
    install_requires=[
        'uwsgi',
--- a/tests/fixture.py
+++ b/tests/fixture.py
@ -3,6 +3,8 @@ import pytest

 import yaml

+from pywb.cdx.perms import AllowAllPerms
+
@pytest.fixture
 def testconfig():
    config = yaml.load(open('test_config.yaml'))
@ -25,7 +27,7 @@ class PrintReporter:
        pass

 #================================================================
-class TestExclusionPerms:
+class TestExclusionPerms(AllowAllPerms):
    """
    Perm Checker fixture which can block one URL.
    """
@ -37,20 +39,7 @@ class TestExclusionPerms:
        Return true/false if url or urlkey (canonicalized url)
        should be allowed
        """
-        print "allow_url_lookup:urlkey={}".format(urlkey)
        if urlkey == self.URLKEY_EXCLUDED:
            return False

-        return True
-
-    def allow_capture(self, cdx):
-        """
-        Return True if specified capture (cdx) is allowed.
-        """
-        return True
-
-    def filter_fields(self, cdx):
-        """
-        Filter out any forbidden cdx fields from cdx object
-        """
-        return cdx
+        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -2,6 +2,7 @@ import webtest
 from pywb.pywb_init import pywb_config
 from pywb.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.perms import AllowAllPerms

 class TestWb:
    TEST_CONFIG = 'test_config.yaml'
@ -75,7 +76,19 @@ class TestWb:

        assert 'Mon, Jan 27 2014 17:12:38' in resp.body
        assert 'wb.js' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
+
+    def test_replay_identity_1(self):
+        """
+        Replay using the 'id_' modifier: the response is expected to have
+        no wb.js insert and to still contain the original, unrewritten url.
+        """
+        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
+        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
+        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
+        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_content_length_1(self):
        # test larger file, rewritten file (svg!)
				`@ -0,0 +1 @@`
				`zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz`