1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge branch 'master' into cdx-server

This commit is contained in:
Kenji Nagahashi 2014-02-25 23:14:15 +00:00
commit 14f4b4d26e
33 changed files with 1120 additions and 342 deletions

8
.coveragerc Normal file
View File

@ -0,0 +1,8 @@
[run]
omit =
*/test/*
*/tests/*
[report]
exclude_lines =
if __name__ == .__main__.:

View File

@ -4,7 +4,14 @@ python:
# command to install dependencies # command to install dependencies
install: install:
- "python setup.py -q install" - "python setup.py -q install"
- "pip install python-coveralls"
- "pip install pytest-cov"
# command to run tests # command to run tests
#script: nosetests --with-doctest #script: nosetests --with-doctest
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py #script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
script: py.test -v --doctest-module ./tests/*.py ./pywb/ #script: py.test -v --doctest-module ./tests/*.py ./pywb/
script:
py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
after_success:
coveralls

View File

@ -3,13 +3,13 @@ import re
from wbrequestresponse import WbRequest, WbResponse from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#================================================================= #=================================================================
# ArchivalRouter -- route WB requests in archival mode # ArchivalRouter -- route WB requests in archival mode
#================================================================= #=================================================================
class ArchivalRouter: class ArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
self.routes = routes self.routes = routes
self.fallback = ReferRedirect(hostpaths) self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path self.abs_path = abs_path
@ -69,24 +69,25 @@ class Route:
if not matcher: if not matcher:
return None return None
rel_prefix = matcher.group(0) matched_str = matcher.group(0)
if rel_prefix: if matched_str:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/' rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
else: else:
wb_prefix = env['SCRIPT_NAME'] + '/' rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
coll = matcher.group(self.coll_group) coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = request_uri, request_uri=request_uri,
wb_url_str = wb_url_str, wb_url_str=wb_url_str,
wb_prefix = wb_prefix, rel_prefix=rel_prefix,
coll = coll, coll=coll,
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '', use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type()) wburl_class = self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters # Allow for applying of additional filters

View File

@ -2,6 +2,7 @@
""" """
import surt import surt
import urlparse
from cdxobject import CDXException from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
return surt return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with a custom canonicalizer or the
    standard canonicalizer, with or without surt ordering), then
    compute a (start_key, end_key) search range for the given
    match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available with surt ordering)

    Raises an Exception for an unknown match type, or for
    matchType=domain with non-surt ordering.

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    Exception: matchType=domain unsupported for non-surt
    """
    def inc_last_char(x):
        # smallest string that sorts after every string prefixed by x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches the custom canonicalizer's setting
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            end_key = host + '*'
        else:
            host = urlparse.urlsplit(url).netloc

            start_key = host + '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            raise Exception('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

        # if tld, use 'com,' as the start_key
        # otherwise, stick with 'com,example)/'
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        end_key = host + '-'
    else:
        raise Exception('Invalid match_type: ' + match_type)

    return (start_key, end_key)
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -77,3 +77,34 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items()) li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li) return ' '.join(li)
#=================================================================
class IDXObject(OrderedDict):
    """
    A single line of a zipnum cluster .idx/.summary index, parsed into
    an ordered mapping of named fields.

    Required fields: urlkey, part, offset, length
    Optional field: lineno
    """

    FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno']
    NUM_REQ_FIELDS = len(FORMAT) - 1  # lineno is an optional field

    def __init__(self, idxline):
        """
        Parse a tab-delimited idx line.

        :param idxline: one raw line from the .idx/.summary file
        :raises Exception: if fewer than NUM_REQ_FIELDS fields are present
        """
        OrderedDict.__init__(self)

        idxline = idxline.rstrip()
        fields = idxline.split('\t')

        if len(fields) < self.NUM_REQ_FIELDS:
            msg = 'invalid idx format: {0} fields found, {1} required'
            raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))

        # zip() (instead of py2-only itertools.izip) pairs each header
        # with its field; a missing optional lineno is simply skipped
        for header, field in zip(self.FORMAT, fields):
            self[header] = field

        # numeric fields used for seeking into the compressed cluster
        self['offset'] = int(self['offset'])
        self['length'] = int(self['length'])

        lineno = self.get('lineno')
        if lineno:
            self['lineno'] = int(lineno)

        self.idxline = idxline

    def __str__(self):
        # the original raw line (rstripped)
        return self.idxline

View File

@ -1,4 +1,4 @@
from cdxobject import CDXObject, AccessException from cdxobject import CDXObject, IDXObject, AccessException
from pywb.utils.timeutils import timestamp_to_sec from pywb.utils.timeutils import timestamp_to_sec
import bisect import bisect
@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields):
def cdx_load_and_filter(sources, params): def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params) cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter) cdx_iter = make_obj_iter(cdx_iter, params)
if params.get('proxyAll'): if params.get('proxyAll'):
return cdx_iter return cdx_iter
@ -102,9 +102,15 @@ def load_cdx_streams(sources, params):
#================================================================= #=================================================================
# convert text cdx stream to CDXObject # convert text cdx stream to CDXObject/IDXObject
def make_cdx_iter(text_iter): def make_obj_iter(text_iter, params):
return itertools.imap(lambda line: CDXObject(line), text_iter) # already converted
if params.get('showPagedIndex'):
cls = IDXObject
else:
cls = CDXObject
return itertools.imap(lambda line: cls(line), text_iter)
#================================================================= #=================================================================

View File

@ -1,10 +1,13 @@
from canonicalize import UrlCanonicalizer from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain from itertools import chain
import logging import logging
import os import os
@ -14,8 +17,23 @@ import urlparse
#================================================================= #=================================================================
class BaseCDXServer(object): class BaseCDXServer(object):
def __init__(self, **kwargs): def __init__(self, **kwargs):
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer()) ds_rules = kwargs.get('ds_rules')
self.fuzzy_query = kwargs.get('fuzzy_query') surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules
if ds_rules:
self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
# or custom passed in canonicalizer
else:
self.url_canon = kwargs.get('url_canon')
self.fuzzy_query = kwargs.get('fuzzy_query')
# set default canonicalizer if none set thus far
if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered)
# set perms checker, if any
self.perms_checker = kwargs.get('perms_checker') self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params): def _check_cdx_iter(self, cdx_iter, params):
@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer):
def __init__(self, paths, **kwargs): def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs) super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths) self.sources = create_cdx_sources(paths, kwargs.get('config'))
def load_cdx(self, **params): def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization # if key not set, assume 'url' is set and needs canonicalization
@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server' msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg) raise CDXException(msg)
params['key'] = self.url_canon(url) #params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker) cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths') paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True) surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker') perms_checker = config.get('perms_checker')
pass_config = config
else: else:
paths = config paths = config
surt_ordered = True surt_ordered = True
perms_checker = None perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file: if isinstance(paths, str) and is_http(paths):
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
server_cls = RemoteCDXServer server_cls = RemoteCDXServer
else: else:
server_cls = CDXServer server_cls = CDXServer
return server_cls(paths, return server_cls(paths,
url_canon=canon, config=pass_config,
fuzzy_query=fuzzy, surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)
#================================================================= #=================================================================
def create_cdx_sources(paths): def create_cdx_sources(paths, config=None):
sources = [] sources = []
if not isinstance(paths, list): if not isinstance(paths, list):
@ -161,13 +179,13 @@ def create_cdx_sources(paths):
for path in paths: for path in paths:
if isinstance(path, CDXSource): if isinstance(path, CDXSource):
add_cdx_source(sources, path) add_cdx_source(sources, path, config)
elif isinstance(path, str): elif isinstance(path, str):
if os.path.isdir(path): if os.path.isdir(path):
for file in os.listdir(path): for file in os.listdir(path):
add_cdx_source(sources, path + file) add_cdx_source(sources, path + file, config)
else: else:
add_cdx_source(sources, path) add_cdx_source(sources, path, config)
if len(sources) == 0: if len(sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources)) logging.exception('No CDX Sources Found from: ' + str(sources))
@ -176,9 +194,9 @@ def create_cdx_sources(paths):
#================================================================= #=================================================================
def add_cdx_source(sources, source): def add_cdx_source(sources, source, config):
if not isinstance(source, CDXSource): if not isinstance(source, CDXSource):
source = create_cdx_source(source) source = create_cdx_source(source, config)
if not source: if not source:
return return
@ -187,19 +205,20 @@ def add_cdx_source(sources, source):
#================================================================= #=================================================================
def create_cdx_source(filename): def create_cdx_source(filename, config):
if filename.startswith('http://') or filename.startswith('https://'): if is_http(filename):
return RemoteCDXSource(filename) return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'): if filename.endswith('.cdx'):
return CDXFile(filename) return CDXFile(filename)
if filename.endswith('.summary'):
return ZipNumCluster(filename, config)
return None return None
#TODO: support zipnum
#elif filename.endswith('.summary')
# return ZipNumCDXSource(filename)
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#================================================================= #=================================================================

View File

@ -1,9 +1,9 @@
from pywb.utils.binsearch import iter_exact, iter_prefix from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
import urllib import urllib
import urllib2 import urllib2
import itertools
#================================================================= #=================================================================
class CDXSource(object): class CDXSource(object):
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params): def load_cdx(self, params):
source = SeekableTextFileReader(self.filename) source = SeekableTextFileReader(self.filename)
return iter_range(source, params.get('key'), params.get('end_key'))
match_type = params.get('matchType')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self): def __str__(self):
return 'CDX File - ' + self.filename return 'CDX File - ' + self.filename
@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self): def __str__(self):
return 'Remote CDX Server: ' + self.remote_url return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source backed by a redis sorted set.

    Capture lines are stored (without their urlkey) in a zset whose key
    is ``key_prefix + urlkey``; lines are returned with the urlkey
    prepended so they read like ordinary cdx lines.
    """
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        # lazy import: redis is only required when this source is used
        import redis
        self.redis = redis.StrictRedis.from_url(redis_url)

        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported
        """
        # ensure only url/surt is part of key
        surt_key = params['key'].split(' ')[0]

        members = self.redis.zrange(self.key_prefix + surt_key, 0, -1)

        # key is not part of the stored list, so prepend it to each line
        prefix = surt_key + ' '
        return (prefix + line for line in members)

View File

@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test # NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') #>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items()) #>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
('timestamp', '20020120142510'), ('timestamp', '20020120142510'),
('original', 'http://example.com:80/'), ('original', 'http://example.com:80/'),

203
pywb/cdx/zipnum.py Normal file
View File

@ -0,0 +1,203 @@
import os
import collections
import itertools
import logging
from cStringIO import StringIO
import datetime
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
#=================================================================
class ZipBlocks:
    """A contiguous run of compressed cdx blocks within one cluster part."""
    def __init__(self, part, offset, length, count):
        # record all range fields in one unpacking assignment
        (self.part,
         self.offset,
         self.length,
         self.count) = (part, offset, length, count)
#=================================================================
def readline_to_iter(stream):
    """
    Yield lines from *stream* via readline() until exhausted, closing
    the stream when iteration finishes (or the generator is abandoned).

    (Removed the previous unused ``count`` accumulator.)
    """
    try:
        buff = stream.readline()
        while buff:
            yield buff
            buff = stream.readline()
    finally:
        stream.close()
#=================================================================
class ZipNumCluster(CDXSource):
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
DEFAULT_MAX_BLOCKS = 50
def __init__(self, summary, config=None):
loc = None
cookie_maker = None
self.max_blocks = self.DEFAULT_MAX_BLOCKS
reload_ival = self.DEFAULT_RELOAD_INTERVAL
if config:
loc = config.get('zipnum_loc')
cookie_maker = config.get('cookie_maker')
self.max_blocks = config.get('max_blocks', self.max_blocks)
reload_ival = config.get('reload_interval', reload_ival)
if not loc:
splits = os.path.splitext(summary)
loc = splits[0] + '.loc'
self.summary = summary
self.loc_filename = loc
# initial loc map
self.loc_map = {}
self.loc_mtime = 0
self.load_loc()
# reload interval
self.loc_update_time = datetime.datetime.now()
self.reload_interval = datetime.timedelta(minutes=reload_ival)
self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
def load_loc(self):
# check modified time of current file before loading
new_mtime = os.path.getmtime(self.loc_filename)
if (new_mtime == self.loc_mtime):
return
# update loc file mtime
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename) as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@staticmethod
def reload_timed(timestamp, val, delta, func):
now = datetime.datetime.now()
if now - timestamp >= delta:
func()
return now
return None
def reload_loc(self):
reload_time = self.reload_timed(self.loc_update_time,
self.loc_map,
self.reload_interval,
self.load_loc)
if reload_time:
self.loc_update_time = reload_time
def lookup_loc(self, part):
return self.loc_map[part]
def load_cdx(self, params):
self.reload_loc()
reader = SeekableTextFileReader(self.summary)
idx_iter = iter_range(reader,
params['key'],
params['end_key'],
prev_size=1)
if params.get('showPagedIndex'):
params['proxyAll'] = True
return idx_iter
else:
blocks = self.idx_to_cdx(idx_iter, params)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def idx_to_cdx(self, idx_iter, params):
blocks = None
ranges = []
for idx in idx_iter:
idx = IDXObject(idx)
if (blocks and blocks.part == idx['part'] and
blocks.offset + blocks.length == idx['offset'] and
blocks.count < self.max_blocks):
blocks.length += idx['length']
blocks.count += 1
ranges.append(idx['length'])
else:
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length]
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
def block_to_cdx_iter(self, blocks, ranges, params):
last_exc = None
last_traceback = None
for location in self.lookup_loc(blocks.part):
try:
return self.load_blocks(location, blocks, ranges, params)
except Exception as exc:
last_exc = exc
import sys
last_traceback = sys.exc_info()[2]
if last_exc:
raise exc, None, last_traceback
else:
raise Exception('No Locations Found for: ' + block.part)
def load_blocks(self, location, blocks, ranges, params):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location))
reader = self.blk_loader.load(location, blocks.offset, blocks.length)
def decompress_block(range_):
decomp = gzip_decompressor()
buff = decomp.decompress(reader.read(range_))
return readline_to_iter(StringIO(buff))
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, params['key'])
# end bound
end = params['end_key']
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_

View File

@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
from views import TextCapturesView from views import TextCapturesView
class BaseHandler: #=================================================================
@staticmethod class BaseHandler(object):
def get_wburl_type():
return WbUrl
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequest return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#================================================================= #=================================================================
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler(BaseHandler): class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay, html_view = None, search_view = None): def __init__(self, index_reader, replay,
html_view=None, search_view=None):
self.index_reader = index_reader self.index_reader = index_reader
self.replay = replay self.replay = replay
@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
self.html_view = html_view self.html_view = html_view
self.search_view = search_view self.search_view = search_view
def __call__(self, wbrequest): def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/': if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest) return self.render_search_page(wbrequest)
@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
def __str__(self): def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#================================================================= #=================================================================
# CDX-Server Handler -- pass all params to cdx server # CDX-Server Handler -- pass all params to cdx server
#================================================================= #=================================================================
@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
def __str__(self): def __str__(self):
return 'Index Reader: ' + str(self.index_reader) return 'Index Reader: ' + str(self.index_reader)
@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
except IOError: except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str) raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
@staticmethod
def get_wburl_type():
return None
def __str__(self): def __str__(self):
return 'Static files from ' + self.static_path return 'Static files from ' + self.static_path
@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest.env)) return WbResponse.text_response(str(wbrequest.env))
#================================================================= #=================================================================
class DebugEchoHandler(BaseHandler): class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -150,5 +151,3 @@ class PerfTimer:
self.end = time.clock() self.end = time.clock()
if self.perfdict is not None: if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start) self.perfdict[self.name] = str(self.end - self.start)

View File

@ -37,7 +37,7 @@ class IndexReader(object):
def load_cdx(self, **params): def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params) return self.cdx_server.load_cdx(**params)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10): def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
if wburl.type == wburl.URL_QUERY: if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported') raise NotImplementedError('Url Query Not Yet Supported')

View File

@ -45,14 +45,14 @@ class ProxyRouter:
return None return None
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = url, request_uri=url,
wb_url_str = url, wb_url_str=url,
wb_prefix = '', #rel_prefix=url,
coll = '', #host_prefix=self.hostpaths[0],
host_prefix = self.hostpaths[0], wburl_class=self.handler.get_wburl_type(),
wburl_class = self.handler.get_wburl_type(), urlrewriter_class=ProxyHttpsUrlRewriter,
url_rewriter_class = ProxyHttpsUrlRewriter, use_abs_prefix=False,
is_proxy = True) is_proxy=True)
return self.handler(wbrequest) return self.handler(wbrequest)

View File

@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
#================================================================= #=================================================================
class ReplayView: class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None, def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -49,6 +48,9 @@ class ReplayView:
# check if redir is needed # check if redir is needed
self._redirect_if_needed(wbrequest, cdx) self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers)
response = None response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_': if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
@ -148,6 +150,7 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers): def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location
if status_headers.statusline.startswith('3'): if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower() request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower() location_url = status_headers.get_header('Location').lower()
@ -156,3 +159,16 @@ class ReplayView:
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx)) raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
# at correct timestamp now, but must check for referrer redirect
# indirect self-redirect, via meta-refresh, if referrer is same as current url
if status_headers.statusline.startswith('2'):
# build full url even if using relative-rewriting
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
referrer_url = wbrequest.referrer
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))

View File

@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent: class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# ========================================================================= # =========================================================================
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip') stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset: if rewritten_headers.charset:
encoding = rewritten_headers.charset encoding = rewritten_headers.charset

View File

@ -24,9 +24,9 @@ def test_example_2():
def test_example_3(): #def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter) # status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff # assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff

View File

@ -103,10 +103,12 @@ class UrlRewriter:
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url) return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def set_base_url(self, newUrl): def set_base_url(self, newUrl):
self.wburl.url = newUrl self.wburl.url = newUrl
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod @staticmethod
def strip_protocol(url): def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS: for protocol in UrlRewriter.PROTOCOLS:

View File

@ -1,9 +1,5 @@
#!/usr/bin/python #!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
""" """
WbUrl represents the standard wayback archival url format. WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay). A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp) latest_replay: (no timestamp)
http://example.com http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
""" """
class WbUrl: import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
def __init__(self, url='', mod='',
timestamp='', end_timestamp='', type=None):
self.url = url
self.timestamp = timestamp
self.end_timestamp = end_timestamp
self.mod = mod
self.type = type
#=================================================================
class WbUrl(BaseWbUrl):
""" """
# Replay Urls # Replay Urls
# ====================== # ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://' DEFAULT_SCHEME = 'http://'
# ====================== # ======================
def __init__(self, url): def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]): if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url) raise Exception('Invalid WbUrl: ', url)

View File

@ -1,13 +1,19 @@
""" """
Test Route # Test WbRequest parsed via a Route
# route with relative path # route with relative path, print resulting wbrequest
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False) >>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'} {'coll': 'web',
'request_uri': '/web/test.example.com',
'wb_prefix': '/web/',
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# route with absolute path, running at script /my_pywb, print resultingwbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
{'coll': 'web',
'request_uri': '/web/2013im_/test.example.com',
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# not matching route -- skipped # not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -65,7 +71,12 @@ False
""" """
from pywb.archivalrouter import Route, ReferRedirect from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler from pywb.handlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):
varlist = vars(req)
pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
if http_host: if http_host:
env['HTTP_HOST'] = http_host env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())] routes = [Route(coll, WbUrlHandler())]
redir = ReferRedirect(match_host) redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env) #req = WbRequest.from_uri(request_uri, env)
@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
return rep.status_headers.get_header('Location') return rep.status_headers.get_header('Location')
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -0,0 +1,87 @@
"""
# WbRequest Tests
# =================
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
>>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
"""
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
response = req_from_uri(request_uri, env, use_abs_prefix)
varlist = vars(response)
print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
rel_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
rel_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
rel_prefix = '/'
wb_url_str = parts[0]
coll = ''
return WbRequest(env,
request_uri=request_uri,
rel_prefix=rel_prefix,
wb_url_str=wb_url_str,
coll=coll,
wburl_class=WbUrl,
urlrewriter_class=UrlRewriter,
use_abs_prefix=use_abs_prefix)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
return min_ * block_size return min_ * block_size
#=================================================================
def binsearch(reader, key, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) granularity, and return first full line found.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
def gen_iter(line):
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(reader.readline())
#=================================================================
def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
"""
Perform a linear search over iterator until
current_line >= key
optionally also tracking upto N previous lines, which are
returned before the first matched line.
if end of stream is reached before a match is found,
nothing is returned (prev lines discarded also)
"""
prev_deque = deque(maxlen=prev_size + 1)
matched = False
for line in iter_:
prev_deque.append(line)
if compare_func(line, key) >= 0:
matched = True
break
# no matches, so return empty iterator
if not matched:
return []
return itertools.chain(prev_deque, iter_)
#================================================================= #=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192): def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
""" """
@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
When performin_g linear search, keep track of up to N previous lines before When performin_g linear search, keep track of up to N previous lines before
first matching line. first matching line.
""" """
min_ = binsearch_offset(reader, key, compare_func, block_size) iter_ = binsearch(reader, key, compare_func, block_size)
iter_ = linearsearch(iter_,
key, prev_size=prev_size,
compare_func=compare_func)
return iter_
reader.seek(min_)
if min_ > 0: #=================================================================
reader.readline() # skip partial line def iter_range(reader, start, end, prev_size=0):
"""
Creates an iterator which iterates over lines where
start <= line < end (end exclusive)
"""
if prev_size > 1: iter_ = search(reader, start, prev_size=prev_size)
prev_deque = deque(max_len=prev_size)
line = None end_iter = itertools.takewhile(
lambda line: line < end,
iter_)
while True: return end_iter
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
#================================================================= #=================================================================

View File

@ -11,7 +11,7 @@ def gzip_decompressor():
#================================================================= #=================================================================
class BufferedReader(object): class DecompressingBufferedReader(object):
""" """
A wrapping line reader which wraps an existing reader. A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor} DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream self.stream = stream
self.block_size = block_size self.block_size = block_size
@ -44,24 +44,19 @@ class BufferedReader(object):
self.buff = None self.buff = None
self.num_read = 0 self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if not block_size: if not block_size:
block_size = self.block_size block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len: if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0: data = self.stream.read(block_size)
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
self._process_read(data) self._process_read(data)
def _process_read(self, data): def _process_read(self, data):
data = self._decompress(data) data = self._decompress(data)
self.num_read += len(data) self.buff_size = len(data)
self.num_read += self.buff_size
self.buff = StringIO.StringIO(data) self.buff = StringIO.StringIO(data)
def _decompress(self, data): def _decompress(self, data):
@ -78,12 +73,40 @@ class BufferedReader(object):
return data return data
def read(self, length=None): def read(self, length=None):
"""
Fill bytes and read some number of bytes
(up to length if specified)
< length bytes may be read if reached the end of input
or at a buffer boundary. If at a boundary, the subsequent
call will fill buffer anew.
"""
self._fillbuff() self._fillbuff()
return self.buff.read(length) return self.buff.read(length)
def readline(self, length=None): def readline(self, length=None):
"""
Fill buffer and read a full line from the buffer
(up to specified length, if provided)
If no newline found at end, try filling buffer again in case
at buffer boundary.
"""
self._fillbuff() self._fillbuff()
return self.buff.readline(length) linebuff = self.buff.readline(length)
# we may be at a boundary
while not linebuff.endswith('\n'):
if length:
length -= len(linebuff)
if length <= 0:
break
self._fillbuff()
if self.buff_size == 0:
break
linebuff += self.buff.readline(length)
return linebuff
def close(self): def close(self):
if self.stream: if self.stream:
@ -97,7 +120,7 @@ class ChunkedDataException(Exception):
#================================================================= #=================================================================
class ChunkedDataReader(BufferedReader): class ChunkedDataReader(DecompressingBufferedReader):
r""" r"""
A ChunkedDataReader is a BufferedReader which also supports de-chunking A ChunkedDataReader is a BufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'. of the data if it happens to be http 'chunk-encoded'.
@ -133,7 +156,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if self.not_chunked: if self.not_chunked:
return BufferedReader._fillbuff(self, block_size) return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read: if self.all_chunks_read:
return return

View File

@ -9,18 +9,50 @@ import urllib2
import time import time
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
#================================================================= #=================================================================
# load a reader from http class BlockLoader(object):
#=================================================================
class HttpLoader(object):
""" """
Load a file-like reader over http using range requests a loader which can stream blocks of content
and an optional cookie created via a cookie_maker given a uri, offset and optional length.
Currently supports: http/https and file/local file system
""" """
def __init__(self, cookie_maker=None): def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker self.cookie_maker = cookie_maker
def load(self, url, offset, length): def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0: if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else: else:
@ -71,25 +103,6 @@ class HMACCookieMaker(object):
return cookie return cookie
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
#================================================================= #=================================================================
# Limit Reader # Limit Reader
#================================================================= #=================================================================

View File

@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
""" """
parse stream for status line and headers parse stream for status line and headers
return a StatusAndHeaders object return a StatusAndHeaders object
support continuation headers starting with space or tab
""" """
statusline = stream.readline().rstrip() statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist) protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline) raise StatusAndHeadersParserException(msg, statusline)
headers = [] headers = []
line = stream.readline().rstrip() line = stream.readline().rstrip()
while line and line != '\r\n': while line:
name, value = line.split(':', 1) name, value = line.split(':', 1)
header = (name, value.strip()) name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header) headers.append(header)
line = stream.readline().rstrip() line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(), return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers, headers=headers,
@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception):
def __init__(self, msg, statusline): def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg) super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline self.statusline = statusline

View File

@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/domains/root', iter_exact) >>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
@ -19,18 +20,45 @@ org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact) >>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact) >>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz # Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
# Range Search -- exact
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 1 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 2 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2)
org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
""" """
#================================================================= #=================================================================
import os import os
from pywb.utils.binsearch import iter_prefix, iter_exact from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir from pywb import get_test_dir
@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func):
print line print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
print line
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) >>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji' 'efghji'
# FileLoader Tests (includes LimitReader) # BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100 100
# SeekableTextFileReader Test # SeekableTextFileReader Test
@ -23,25 +23,39 @@
>>> seek_read_full(sr, 100) >>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline() # Buffered Reader Tests
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() #=================================================================
#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression #DecompressingBufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain' 'Example Domain'
# test very small block size
>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3)
>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
'ABCDEFG\\n'
'HIJK'
'LMN\\n'
'OPQR\\n'
'XY'
'Z'
''
""" """
#================================================================= #=================================================================
import os import os
import StringIO import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'

View File

@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S' TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959' #PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string): def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string)) return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility # pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_STAMP_END): def _pad_timestamp(string, pad_str=PAD_6):
""" """
>>> pad_timestamp('20') >>> _pad_timestamp('20')
'20991231235959' '209912'
>>> pad_timestamp('2014') >>> _pad_timestamp('2014')
'20141231235959' '201412'
>>> pad_timestamp('20141011') >>> _pad_timestamp('20141011')
'20141011235959' '20141011'
>>> pad_timestamp('201410110010') >>> _pad_timestamp('201410110010')
'20141011001059' '201410110010'
""" """
str_len = len(string) str_len = len(string)
pad_len = len(pad_str) pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:] if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string): def timestamp_to_datetime(string):
""" """
>>> timestamp_to_datetime('20131226095010') # >14-digit -- rest ignored
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \ >>> timestamp_to_datetime('2014122609501011')
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1) datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014') >>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \ datetime.datetime(2014, 12, 31, 23, 59, 59)
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
""" """
# Default pad to end of range for comptability # pad to 6 digits
return time.strptime(pad_timestamp(string), TIMESTAMP_14) string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)
val = max(min_, min(val, max_))
return val
except:
return max_
def extract(string, start, end, min_, max_):
if len(string) >= end:
return clamp(string[start:end], min_, max_)
else:
return max_
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string): def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399 1420070399
""" """
return calendar.timegm(timestamp_to_datetime(string)) return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -56,9 +56,9 @@ class J2TemplateView:
# Filters # Filters
@staticmethod @staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value) value = timeutils.timestamp_to_datetime(value)
return time.strftime(format, value) return value.strftime(format_)
@staticmethod @staticmethod
def get_host(url): def get_host(url):

View File

@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
#================================================================= #=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord', ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -32,24 +32,12 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date", ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"] "content-type", "length"]
@staticmethod def __init__(self, loader=None, cookie_maker=None, block_size=8192):
def create_default_loaders(cookie_maker=None): if not loader:
http = HttpLoader(cookie_maker) loader = BlockLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192): self.loader = loader
self.loaders = loaders self.block_size = block_size
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -60,22 +48,25 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length): def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url) url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme) #loader = self.loaders.get(url_parts.scheme)
if not loader: #if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url) # raise ArchiveLoadFailed('Unknown Protocol', url)
try: try:
length = int(length) length = int(length)
except: except:
length = -1 length = -1
raw = loader.load(url, long(offset), length) raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip' decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type) # Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream) (the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc': if the_format == 'arc':
rec_type = 'response' rec_type = 'response'
@ -111,7 +102,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type), return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers) rec_headers, stream, status_headers)
def _load_headers(self, stream): def _detect_type_load_headers(self, stream):
""" """
Try parsing record as WARC, then try parsing as ARC. Try parsing record as WARC, then try parsing as ARC.
if neither one succeeds, we're out of luck. if neither one succeeds, we're out of luck.

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e: except Exception as e:
print 'Exception: ' + e.__class__.__name__ print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,99 +1,75 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
import pprint import pprint
#WB Request and Response
#=================================================================
class WbRequest: class WbRequest:
""" """
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b') Represents the main pywb request object.
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c') Contains various info from wsgi env, add additional info
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} about the request, such as coll, relative prefix,
host prefix, absolute prefix.
>>> WbRequest.from_uri('/2010/example.com') If a wburl and url rewriter classes are specified, the class
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} also contains the url rewriter.
>>> WbRequest.from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
""" """
@staticmethod
def from_uri(request_uri, env = {}, use_abs_prefix = False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
wb_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
wb_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
wb_prefix = '/'
wb_url_str = parts[0]
coll = ''
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
@staticmethod @staticmethod
def make_host_prefix(env): def make_host_prefix(env):
try: try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] host = env.get('HTTP_HOST')
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env['wsgi.url_scheme'] + '://' + host
except KeyError: except KeyError:
return '' return ''
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, def __init__(self, env,
host_prefix = '', request_uri=None,
wburl_class = WbUrl, rel_prefix='',
url_rewriter_class = UrlRewriter, wb_url_str='/',
is_proxy = False): coll='',
host_prefix='',
use_abs_prefix=False,
wburl_class=None,
urlrewriter_class=None,
is_proxy=False):
self.env = env self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.host_prefix = host_prefix self.coll = coll
if not host_prefix:
host_prefix = self.make_host_prefix(env)
self.host_prefix = host_prefix
self.rel_prefix = rel_prefix
if use_abs_prefix:
self.wb_prefix = host_prefix + rel_prefix
else:
self.wb_prefix = rel_prefix
self.wb_prefix = host_prefix + wb_prefix
if not wb_url_str: if not wb_url_str:
wb_url_str = '/' wb_url_str = '/'
self.wb_url_str = wb_url_str
# wb_url present and not root page # wb_url present and not root page
if wb_url_str != '/' and wburl_class: if wb_url_str != '/' and wburl_class:
self.wb_url_str = wb_url_str
self.wb_url = wburl_class(wb_url_str) self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix) self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
else: else:
# no wb_url, just store blank wb_url # no wb_url, just store blank wb_url
self.wb_url_str = wb_url_str
self.wb_url = None self.wb_url = None
self.urlrewriter = None self.urlrewriter = None
self.coll = coll
self.referrer = env.get('HTTP_REFERER') self.referrer = env.get('HTTP_REFERER')
self.is_ajax = self._is_ajax() self.is_ajax = self._is_ajax()
@ -122,24 +98,19 @@ class WbRequest:
def __repr__(self): def __repr__(self):
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
#return str(vars(self))
varlist = vars(self) varlist = vars(self)
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) varstr = pprint.pformat(varlist)
return varstr
#=================================================================
class WbResponse: class WbResponse:
""" """
>>> WbResponse.text_response('Test') Represnts a pywb wsgi response object.
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404') Holds a status_headers object and a response iter, to be
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} returned to wsgi container.
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
""" """
def __init__(self, status_headers, value = []): def __init__(self, status_headers, value = []):
self.status_headers = status_headers self.status_headers = status_headers
self.body = value self.body = value
@ -180,8 +151,3 @@ class WbResponse:
def __repr__(self): def __repr__(self):
return str(vars(self)) return str(vars(self))
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self): def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
@ -119,6 +124,20 @@ class TestWb:
assert resp.content_type == 'text/css' assert resp.content_type == 'text/css'
def test_referrer_self_redirect(self):
uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
host = 'somehost:8082'
referrer = 'http://' + host + uri
# capture is normally a 200
resp = self.testapp.get(uri)
assert resp.status_int == 200
# redirect causes skip of this capture, redirect to next
resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
assert resp.status_int == 302
def test_excluded_content(self): def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403 assert resp.status_int == 403