mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'master' into cdx-server

Kenji Nagahashi 2014-02-25 23:14:15 +00:00
commit 14f4b4d26e
33 changed files with 1120 additions and 342 deletions

.coveragerc Normal file

@ -0,0 +1,8 @@
[run]
omit =
*/test/*
*/tests/*
[report]
exclude_lines =
if __name__ == .__main__.:


@ -4,7 +4,14 @@ python:
# command to install dependencies
install:
- "python setup.py -q install"
- "pip install python-coveralls"
- "pip install pytest-cov"
# command to run tests
#script: nosetests --with-doctest
#script: py.test run-tests.py ./pywb/ --doctest-modules --ignore=setup.py
script: py.test -v --doctest-module ./tests/*.py ./pywb/
#script: py.test -v --doctest-module ./tests/*.py ./pywb/
script:
py.test --cov-config .coveragerc --cov pywb -v --doctest-module ./pywb/ tests/
after_success:
coveralls


@ -3,13 +3,13 @@ import re
from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
@ -69,24 +69,25 @@ class Route:
if not matcher:
return None
rel_prefix = matcher.group(0)
matched_str = matcher.group(0)
if rel_prefix:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix part of uri
if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
else:
wb_prefix = env['SCRIPT_NAME'] + '/'
rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env,
request_uri = request_uri,
wb_url_str = wb_url_str,
wb_prefix = wb_prefix,
coll = coll,
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
wburl_class = self.handler.get_wburl_type())
request_uri=request_uri,
wb_url_str=wb_url_str,
rel_prefix=rel_prefix,
coll=coll,
use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters
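For illustration only (not part of the commit), a standalone sketch of the rel_prefix / wb_url_str split performed above, with hypothetical values:

matched_str = 'web'
script_name = '/my_pywb'
request_uri = '/web/2013/http://example.com/'

rel_prefix = script_name + '/' + matched_str + '/'
# -> '/my_pywb/web/'
wb_url_str = request_uri[len(matched_str) + 2:]  # strip '/' + matched_str + '/'
# -> '2013/http://example.com/'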


@ -2,6 +2,7 @@
"""
import surt
import urlparse
from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
"""
Canonicalize a url (either with a custom canonicalizer or the
standard canonicalizer, with or without surt).
Then compute the start and end urls of the search range
for a given match type.
Supported match types:
* exact
* prefix
* host
* domain (only available with surt ordering)
Examples below:
# surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact')
('com,example)/path/file.html', 'com,example)/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
('com,example)/path/file.html', 'com,example)/path/file.htmm')
>>> calc_search_range('http://example.com/path/file.html', 'host')
('com,example)/', 'com,example*')
>>> calc_search_range('http://example.com/path/file.html', 'domain')
('com,example)/', 'com,example-')
special case for tld domain range
>>> calc_search_range('com', 'domain')
('com,', 'com-')
# non-surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact', False)
('example.com/path/file.html', 'example.com/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
('example.com/path/file.html', 'example.com/path/file.htmm')
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
('example.com/', 'example.com0')
# domain range not supported
>>> calc_search_range('http://example.com/path/file.html', 'domain', False)
Traceback (most recent call last):
Exception: matchType=domain unsupported for non-surt
"""
def inc_last_char(x):
return x[0:-1] + chr(ord(x[-1]) + 1)
if not url_canon:
# make new canon
url_canon = UrlCanonicalizer(surt_ordered)
else:
# ensure surt order matches url_canon
surt_ordered = url_canon.surt_ordered
start_key = url_canon(url)
if match_type == 'exact':
end_key = start_key + '!'
elif match_type == 'prefix':
# add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'
end_key = inc_last_char(start_key)
elif match_type == 'host':
if surt_ordered:
host = start_key.split(')/')[0]
start_key = host + ')/'
end_key = host + '*'
else:
host = urlparse.urlsplit(url).netloc
start_key = host + '/'
end_key = host + '0'
elif match_type == 'domain':
if not surt_ordered:
raise Exception('matchType=domain unsupported for non-surt')
host = start_key.split(')/')[0]
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if not ',' in host:
start_key = host + ','
else:
start_key = host + ')/'
end_key = host + '-'
else:
raise Exception('Invalid match_type: ' + match_type)
return (start_key, end_key)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -77,3 +77,34 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li)
#=================================================================
class IDXObject(OrderedDict):
FORMAT = ['urlkey', 'part', 'offset', 'length', 'lineno']
NUM_REQ_FIELDS = len(FORMAT) - 1 # lineno is an optional field
def __init__(self, idxline):
OrderedDict.__init__(self)
idxline = idxline.rstrip()
fields = idxline.split('\t')
if len(fields) < self.NUM_REQ_FIELDS:
msg = 'invalid idx format: {0} fields found, {1} required'
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in itertools.izip(self.FORMAT, fields):
self[header] = field
self['offset'] = int(self['offset'])
self['length'] = int(self['length'])
lineno = self.get('lineno')
if lineno:
self['lineno'] = int(lineno)
self.idxline = idxline
def __str__(self):
return self.idxline
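A minimal usage sketch; the idx line below is hypothetical, in the tab-separated FORMAT above (module path assumed to be pywb.cdx.cdxobject):

from pywb.cdx.cdxobject import IDXObject

line = 'com,example)/\tpart-00\t3040\t502\t17'
idx = IDXObject(line)
assert idx['urlkey'] == 'com,example)/'
assert idx['offset'] == 3040 and idx['length'] == 502  # coerced to int
assert str(idx) == line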


@ -1,4 +1,4 @@
from cdxobject import CDXObject, AccessException
from cdxobject import CDXObject, IDXObject, AccessException
from pywb.utils.timeutils import timestamp_to_sec
import bisect
@ -56,7 +56,7 @@ def cdx_text_out(cdx, fields):
def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter)
cdx_iter = make_obj_iter(cdx_iter, params)
if params.get('proxyAll'):
return cdx_iter
@ -102,9 +102,15 @@ def load_cdx_streams(sources, params):
#=================================================================
# convert text cdx stream to CDXObject
def make_cdx_iter(text_iter):
return itertools.imap(lambda line: CDXObject(line), text_iter)
# convert text cdx stream to CDXObject/IDXObject
def make_obj_iter(text_iter, params):
# already converted
if params.get('showPagedIndex'):
cls = IDXObject
else:
cls = CDXObject
return itertools.imap(lambda line: cls(line), text_iter)
#=================================================================


@ -1,10 +1,13 @@
from canonicalize import UrlCanonicalizer
from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http
from itertools import chain
import logging
import os
@ -14,8 +17,23 @@ import urlparse
#=================================================================
class BaseCDXServer(object):
def __init__(self, **kwargs):
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
self.fuzzy_query = kwargs.get('fuzzy_query')
ds_rules = kwargs.get('ds_rules')
surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules
if ds_rules:
self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
# or custom passed in canonicalizer
else:
self.url_canon = kwargs.get('url_canon')
self.fuzzy_query = kwargs.get('fuzzy_query')
# set default canonicalizer if none set thus far
if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered)
# set perms checker, if any
self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params):
@ -66,7 +84,7 @@ class CDXServer(BaseCDXServer):
def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths)
self.sources = create_cdx_sources(paths, kwargs.get('config'))
def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization
@ -77,7 +95,14 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
params['key'] = self.url_canon(url)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -124,36 +149,29 @@ def create_cdx_server(config, ds_rules_file=None):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker')
pass_config = config
else:
paths = config
surt_ordered = True
perms_checker = None
pass_config = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file:
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
if isinstance(paths, str) and is_http(paths):
server_cls = RemoteCDXServer
else:
server_cls = CDXServer
return server_cls(paths,
url_canon=canon,
fuzzy_query=fuzzy,
config=pass_config,
surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker)
#=================================================================
def create_cdx_sources(paths):
def create_cdx_sources(paths, config=None):
sources = []
if not isinstance(paths, list):
@ -161,13 +179,13 @@ def create_cdx_sources(paths):
for path in paths:
if isinstance(path, CDXSource):
add_cdx_source(sources, path)
add_cdx_source(sources, path, config)
elif isinstance(path, str):
if os.path.isdir(path):
for file in os.listdir(path):
add_cdx_source(sources, path + file)
add_cdx_source(sources, path + file, config)
else:
add_cdx_source(sources, path)
add_cdx_source(sources, path, config)
if len(sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources))
@ -176,9 +194,9 @@ def create_cdx_sources(paths):
#=================================================================
def add_cdx_source(sources, source):
def add_cdx_source(sources, source, config):
if not isinstance(source, CDXSource):
source = create_cdx_source(source)
source = create_cdx_source(source, config)
if not source:
return
@ -187,19 +205,20 @@ def add_cdx_source(sources, source):
#=================================================================
def create_cdx_source(filename):
if filename.startswith('http://') or filename.startswith('https://'):
def create_cdx_source(filename, config):
if is_http(filename):
return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
return CDXFile(filename)
if filename.endswith('.summary'):
return ZipNumCluster(filename, config)
return None
#TODO: support zipnum
#elif filename.endswith('.summary')
# return ZipNumCDXSource(filename)
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#=================================================================
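A sketch of driving the new config plumbing end-to-end; paths and values are hypothetical, and the module path is assumed to be pywb.cdx.cdxserver:

from pywb.cdx.cdxserver import create_cdx_server

config = {
    'index_paths': 'sample_archive/cdx/',  # dir of .cdx files; redis:// urls and .summary files also work
    'surt_ordered': True,
    'redis_key_prefix': 'c:',  # read only by RedisCDXSource
}

server = create_cdx_server(config)  # picks CDXServer (local) vs RemoteCDXServer (http)
for cdx in server.load_cdx(url='http://example.com/', matchType='exact', output='raw'):
    print cdx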


@ -1,9 +1,9 @@
from pywb.utils.binsearch import iter_exact, iter_prefix
from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
import itertools
#=================================================================
class CDXSource(object):
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params):
source = SeekableTextFileReader(self.filename)
match_type = params.get('matchType')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
return iter_range(source, params.get('key'), params.get('end_key'))
def __str__(self):
return 'CDX File - ' + self.filename
@ -90,3 +80,35 @@ class RemoteCDXSource(CDXSource):
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
DEFAULT_KEY_PREFIX = 'c:'
def __init__(self, redis_url, config=None):
import redis
self.redis = redis.StrictRedis.from_url(redis_url)
self.key_prefix = self.DEFAULT_KEY_PREFIX
if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, params):
"""
Load cdx from redis cache, from an ordered list
Currently, there is no support for range queries
Only 'exact' matchType is supported
"""
key = params['key']
# ensure only url/surt is part of key
key = key.split(' ')[0]
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
# key is not part of list, so prepend to each line
key += ' '
cdx_list = itertools.imap(lambda x: key + x, cdx_list)
return cdx_list
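For context, a hypothetical writer producing entries in the shape load_cdx expects: one sorted set per canonicalized urlkey, each member holding the rest of the cdx line. The ingest format and scoring are assumptions, not part of this commit (redis-py 2.x StrictRedis.zadd signature: score first, then member):

import redis

r = redis.StrictRedis.from_url('redis://localhost:6379/0')
urlkey = 'com,example)/'  # canonicalized surt key
rest = ('20140101000000 http://example.com/ text/html 200 '
        '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 1043 333 example.warc.gz')
# score by timestamp so zrange returns captures in time order (assumption)
r.zadd('c:' + urlkey, 20140101000000, rest)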


@ -132,8 +132,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
#>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),

pywb/cdx/zipnum.py Normal file

@ -0,0 +1,203 @@
import os
import collections
import itertools
import logging
from cStringIO import StringIO
import datetime
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
#=================================================================
class ZipBlocks:
def __init__(self, part, offset, length, count):
self.part = part
self.offset = offset
self.length = length
self.count = count
#=================================================================
def readline_to_iter(stream):
try:
count = 0
buff = stream.readline()
while buff:
count += 1
yield buff
buff = stream.readline()
finally:
stream.close()
#=================================================================
class ZipNumCluster(CDXSource):
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
DEFAULT_MAX_BLOCKS = 50
def __init__(self, summary, config=None):
loc = None
cookie_maker = None
self.max_blocks = self.DEFAULT_MAX_BLOCKS
reload_ival = self.DEFAULT_RELOAD_INTERVAL
if config:
loc = config.get('zipnum_loc')
cookie_maker = config.get('cookie_maker')
self.max_blocks = config.get('max_blocks', self.max_blocks)
reload_ival = config.get('reload_interval', reload_ival)
if not loc:
splits = os.path.splitext(summary)
loc = splits[0] + '.loc'
self.summary = summary
self.loc_filename = loc
# initial loc map
self.loc_map = {}
self.loc_mtime = 0
self.load_loc()
# reload interval
self.loc_update_time = datetime.datetime.now()
self.reload_interval = datetime.timedelta(minutes=reload_ival)
self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
def load_loc(self):
# check modified time of current file before loading
new_mtime = os.path.getmtime(self.loc_filename)
if (new_mtime == self.loc_mtime):
return
# update loc file mtime
self.loc_mtime = new_mtime
logging.debug('Loading loc from: ' + self.loc_filename)
with open(self.loc_filename) as fh:
for line in fh:
parts = line.rstrip().split('\t')
self.loc_map[parts[0]] = parts[1:]
@staticmethod
def reload_timed(timestamp, val, delta, func):
now = datetime.datetime.now()
if now - timestamp >= delta:
func()
return now
return None
def reload_loc(self):
reload_time = self.reload_timed(self.loc_update_time,
self.loc_map,
self.reload_interval,
self.load_loc)
if reload_time:
self.loc_update_time = reload_time
def lookup_loc(self, part):
return self.loc_map[part]
def load_cdx(self, params):
self.reload_loc()
reader = SeekableTextFileReader(self.summary)
idx_iter = iter_range(reader,
params['key'],
params['end_key'],
prev_size=1)
if params.get('showPagedIndex'):
params['proxyAll'] = True
return idx_iter
else:
blocks = self.idx_to_cdx(idx_iter, params)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def idx_to_cdx(self, idx_iter, params):
blocks = None
ranges = []
for idx in idx_iter:
idx = IDXObject(idx)
if (blocks and blocks.part == idx['part'] and
blocks.offset + blocks.length == idx['offset'] and
blocks.count < self.max_blocks):
blocks.length += idx['length']
blocks.count += 1
ranges.append(idx['length'])
else:
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length]
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
def block_to_cdx_iter(self, blocks, ranges, params):
last_exc = None
last_traceback = None
for location in self.lookup_loc(blocks.part):
try:
return self.load_blocks(location, blocks, ranges, params)
except Exception as exc:
last_exc = exc
import sys
last_traceback = sys.exc_info()[2]
if last_exc:
raise last_exc, None, last_traceback
else:
raise Exception('No Locations Found for: ' + blocks.part)
def load_blocks(self, location, blocks, ranges, params):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location))
reader = self.blk_loader.load(location, blocks.offset, blocks.length)
def decompress_block(range_):
decomp = gzip_decompressor()
buff = decomp.decompress(reader.read(range_))
return readline_to_iter(StringIO(buff))
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, params['key'])
# end bound
end = params['end_key']
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_
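A small sketch of the trick decompress_block relies on: a zipnum cluster part is a concatenation of independently gzipped members, so each block can be decompressed on its own given its offset and compressed lengths. The file name, offset and lengths below are hypothetical:

import zlib

ranges = [120, 95]  # compressed block lengths, from consecutive idx lines
with open('cluster-part-00.gz', 'rb') as fh:
    fh.seek(3040)                # blocks.offset
    buff = fh.read(sum(ranges))  # blocks.length

offset = 0
for length in ranges:
    member = buff[offset:offset + length]
    offset += length
    # wbits=16+MAX_WBITS expects a gzip header, as in gzip_decompressor()
    text = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(member)
    for cdx_line in text.splitlines():
        print cdx_line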


@ -10,19 +10,28 @@ from wbexceptions import WbException, NotFoundException
from views import TextCapturesView
class BaseHandler:
@staticmethod
def get_wburl_type():
return WbUrl
#=================================================================
class BaseHandler(object):
def __call__(self, wbrequest):
return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#=================================================================
# Standard WB Handler
#=================================================================
class WBHandler(BaseHandler):
def __init__(self, index_reader, replay, html_view = None, search_view = None):
class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay,
html_view=None, search_view=None):
self.index_reader = index_reader
self.replay = replay
@ -31,7 +40,6 @@ class WBHandler(BaseHandler):
self.html_view = html_view
self.search_view = search_view
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
@ -61,6 +69,7 @@ class WBHandler(BaseHandler):
def __str__(self):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
@ -75,11 +84,6 @@ class CDXHandler(BaseHandler):
return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
def __str__(self):
return 'Index Reader: ' + str(self.index_reader)
@ -115,10 +119,6 @@ class StaticHandler(BaseHandler):
except IOError:
raise NotFoundException('Static File Not Found: ' + wbrequest.wb_url_str)
@staticmethod
def get_wburl_type():
return None
def __str__(self):
return 'Static files from ' + self.static_path
@ -130,6 +130,7 @@ class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest):
return WbResponse.text_response(str(wbrequest.env))
#=================================================================
class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest):
@ -150,5 +151,3 @@ class PerfTimer:
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)


@ -37,7 +37,7 @@ class IndexReader(object):
def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params)
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')


@ -45,14 +45,14 @@ class ProxyRouter:
return None
wbrequest = WbRequest(env,
request_uri = url,
wb_url_str = url,
wb_prefix = '',
coll = '',
host_prefix = self.hostpaths[0],
wburl_class = self.handler.get_wburl_type(),
url_rewriter_class = ProxyHttpsUrlRewriter,
is_proxy = True)
request_uri=url,
wb_url_str=url,
#rel_prefix=url,
#host_prefix=self.hostpaths[0],
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=ProxyHttpsUrlRewriter,
use_abs_prefix=False,
is_proxy=True)
return self.handler(wbrequest)


@ -7,7 +7,6 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
#=================================================================
class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -49,6 +48,9 @@ class ReplayView:
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers)
response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
@ -148,6 +150,7 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location
if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower()
@ -156,3 +159,16 @@ class ReplayView:
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
# at correct timestamp now, but must check for referrer redirect
# indirect self-redirect, via meta-refresh, if referrer is same as current url
if status_headers.statusline.startswith('2'):
# build full url even if using relative-rewriting
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
referrer_url = wbrequest.referrer
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))


@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip')
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset


@ -24,9 +24,9 @@ def test_example_2():
def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
#def test_example_3():
# status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
# assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff


@ -103,10 +103,12 @@ class UrlRewriter:
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def set_base_url(self, newUrl):
self.wburl.url = newUrl
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod
def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS:


@ -1,9 +1,5 @@
#!/usr/bin/python
import re
import rfc3987
# WbUrl : wb archival url representation for WB
"""
WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay).
@ -34,9 +30,38 @@ replay form:
latest_replay: (no timestamp)
http://example.com
Additionally, the BaseWbUrl provides the base components
(url, timestamp, end_timestamp, modifier, type) which
can be used to provide a custom representation of the
wayback url format.
"""
class WbUrl:
import re
import rfc3987
#=================================================================
class BaseWbUrl(object):
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
def __init__(self, url='', mod='',
timestamp='', end_timestamp='', type=None):
self.url = url
self.timestamp = timestamp
self.end_timestamp = end_timestamp
self.mod = mod
self.type = type
#=================================================================
class WbUrl(BaseWbUrl):
"""
# Replay Urls
# ======================
@ -107,22 +132,14 @@ class WbUrl:
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://'
# ======================
def __init__(self, url):
super(WbUrl, self).__init__()
self.original_url = url
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url)
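A quick usage sketch of the refactored class; urls are hypothetical and follow the forms listed in the docstring above:

from pywb.rewrite.wburl import WbUrl

u = WbUrl('20131226101010im_/http://example.com/')
assert u.type == u.REPLAY
assert (u.timestamp, u.mod, u.url) == ('20131226101010', 'im_', 'http://example.com/')

u2 = WbUrl('example.com')  # no timestamp; DEFAULT_SCHEME is prepended
assert u2.type == u2.LATEST_REPLAY
assert u2.url == 'http://example.com'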


@ -1,13 +1,19 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# Test WbRequest parsed via a Route
# route with relative path, print resulting wbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False))
{'coll': 'web',
'request_uri': '/web/test.example.com',
'wb_prefix': '/web/',
'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com')}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# route with absolute path, running at script /my_pywb, print resulting wbrequest
>>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True))
{'coll': 'web',
'request_uri': '/web/2013im_/test.example.com',
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -65,7 +71,12 @@ False
"""
from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler
from pywb.handlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):
varlist = vars(req)
pprint.pprint({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
@ -74,7 +85,7 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())]
routes = [Route(coll, WbUrlHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
@ -85,4 +96,6 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col
return rep.status_headers.get_header('Location')
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -0,0 +1,87 @@
"""
# WbRequest Tests
# =================
>>> print_req_from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> print_req_from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
>>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
"""
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
response = req_from_uri(request_uri, env, use_abs_prefix)
varlist = vars(response)
print str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
def req_from_uri(request_uri, env={}, use_abs_prefix=False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
rel_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
rel_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
rel_prefix = '/'
wb_url_str = parts[0]
coll = ''
return WbRequest(env,
request_uri=request_uri,
rel_prefix=rel_prefix,
wb_url_str=wb_url_str,
coll=coll,
wburl_class=WbUrl,
urlrewriter_class=UrlRewriter,
use_abs_prefix=use_abs_prefix)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -35,6 +35,58 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
return min_ * block_size
#=================================================================
def binsearch(reader, key, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) granularity, and return first full line found.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
def gen_iter(line):
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(reader.readline())
#=================================================================
def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
"""
Perform a linear search over iterator until
current_line >= key
optionally also tracking up to N previous lines, which are
returned before the first matched line.
If the end of the stream is reached before a match is found,
nothing is returned (the previous lines are discarded as well).
"""
prev_deque = deque(maxlen=prev_size + 1)
matched = False
for line in iter_:
prev_deque.append(line)
if compare_func(line, key) >= 0:
matched = True
break
# no matches, so return empty iterator
if not matched:
return []
return itertools.chain(prev_deque, iter_)
#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
"""
@ -45,46 +97,27 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
When performing linear search, keep track of up to N previous lines before
first matching line.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
iter_ = binsearch(reader, key, compare_func, block_size)
iter_ = linearsearch(iter_,
key, prev_size=prev_size,
compare_func=compare_func)
return iter_
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
#=================================================================
def iter_range(reader, start, end, prev_size=0):
"""
Creates an iterator which iterates over lines where
start <= line < end (end exclusive)
"""
if prev_size > 1:
prev_deque = deque(max_len=prev_size)
iter_ = search(reader, start, prev_size=prev_size)
line = None
end_iter = itertools.takewhile(
lambda line: line < end,
iter_)
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
return end_iter
#=================================================================
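Usage sketch for the new iter_range; the cdx path is hypothetical, and the pattern mirrors the doctests added in the binsearch tests below:

from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader

reader = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
# lines with start <= line < end, plus one preceding line for context
for line in iter_range(reader, 'org,iana)/about', 'org,iana)/about!', prev_size=1):
    print line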


@ -11,7 +11,7 @@ def gzip_decompressor():
#=================================================================
class BufferedReader(object):
class DecompressingBufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream
self.block_size = block_size
@ -44,24 +44,19 @@ class BufferedReader(object):
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None):
if not block_size:
block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0:
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
data = self.stream.read(block_size)
self._process_read(data)
def _process_read(self, data):
data = self._decompress(data)
self.num_read += len(data)
self.buff_size = len(data)
self.num_read += self.buff_size
self.buff = StringIO.StringIO(data)
def _decompress(self, data):
@ -78,12 +73,40 @@ class BufferedReader(object):
return data
def read(self, length=None):
"""
Fill the buffer and read some number of bytes
(up to length, if specified).
Fewer than length bytes may be read if the end of the input
is reached or a buffer boundary is hit; if at a boundary, the
subsequent call will fill the buffer anew.
"""
self._fillbuff()
return self.buff.read(length)
def readline(self, length=None):
"""
Fill the buffer and read a full line from it
(up to the specified length, if provided).
If no newline is found at the end, try filling the buffer again
in case the read stopped at a buffer boundary.
"""
self._fillbuff()
return self.buff.readline(length)
linebuff = self.buff.readline(length)
# we may be at a boundary
while not linebuff.endswith('\n'):
if length:
length -= len(linebuff)
if length <= 0:
break
self._fillbuff()
if self.buff_size == 0:
break
linebuff += self.buff.readline(length)
return linebuff
def close(self):
if self.stream:
@ -97,7 +120,7 @@ class ChunkedDataException(Exception):
#=================================================================
class ChunkedDataReader(BufferedReader):
class ChunkedDataReader(DecompressingBufferedReader):
r"""
A ChunkedDataReader is a DecompressingBufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'.
@ -133,7 +156,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None):
if self.not_chunked:
return BufferedReader._fillbuff(self, block_size)
return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read:
return


@ -9,18 +9,50 @@ import urllib2
import time
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
class BlockLoader(object):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
a loader which can stream blocks of content
given a uri, offset and optional length.
Currently supports: http/https and file/local file system
"""
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
@ -71,25 +103,6 @@ class HMACCookieMaker(object):
return cookie
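A usage sketch of the unified BlockLoader with an optional cookie maker; urls and key values are hypothetical, following the same pattern as the updated tests further below:

from pywb.utils.loaders import BlockLoader, HMACCookieMaker

loader = BlockLoader(HMACCookieMaker('secret', 'auth-cookie', 5))
# http(s) urls turn into a Range request ('bytes=41-54' here)
print loader.load('http://example.com/', 41, 14).read()

# local paths (with or without file://) are seek + optional LimitReader
print len(BlockLoader().load('file://sample_archive/cdx/iana.cdx', 0, 100).read())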
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
#=================================================================
# Limit Reader
#=================================================================


@ -65,23 +65,36 @@ class StatusAndHeadersParser(object):
"""
parse stream for status line and headers
return a StatusAndHeaders object
support continuation headers starting with space or tab
"""
statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
while line:
name, value = line.split(':', 1)
header = (name, value.strip())
name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
header = (name, value)
headers.append(header)
line = stream.readline().rstrip()
line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,
@ -107,4 +120,3 @@ class StatusAndHeadersParserException(Exception):
def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline


@ -9,6 +9,7 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
@ -19,18 +20,45 @@ org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3G
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
# Range Search -- exact
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 1 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Range Search -- exact + 2 prev
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/about!', iter_range, prev_size=2)
org,iana)/_js/2013.1/jquery.js 20140126201248 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 544 765491 iana.warc.gz
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
"""
#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir
@ -45,6 +73,13 @@ def print_binsearch_results(key, iter_func):
print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# FileLoader Tests (includes LimitReader)
# BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100
# SeekableTextFileReader Test
@ -23,25 +23,39 @@
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
# Buffered Reader Tests
#=================================================================
#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
#DecompressingBufferedReader readline() with decompression
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
# test very small block size
>>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3)
>>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
'ABCDEFG\\n'
'HIJK'
'LMN\\n'
'OPQR\\n'
'XY'
'Z'
''
"""
#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'


@ -0,0 +1,29 @@
"""
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1))
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser
import StringIO
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -17,7 +17,8 @@ DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
#PAD_STAMP_END = '29991231235959'
PAD_6 = '299912'
def iso_date_to_datetime(string):
@ -58,41 +59,145 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str=PAD_STAMP_END):
# pad to certain length (default 6)
def _pad_timestamp(string, pad_str=PAD_6):
"""
>>> pad_timestamp('20')
'20991231235959'
>>> _pad_timestamp('20')
'209912'
>>> pad_timestamp('2014')
'20141231235959'
>>> _pad_timestamp('2014')
'201412'
>>> pad_timestamp('20141011')
'20141011235959'
>>> _pad_timestamp('20141011')
'20141011'
>>> pad_timestamp('201410110010')
'20141011001059'
>>> _pad_timestamp('201410110010')
'201410110010'
"""
str_len = len(string)
pad_len = len(pad_str)
return string if str_len >= pad_len else string + pad_str[str_len:]
if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
# >14-digit -- rest ignored
>>> timestamp_to_datetime('2014122609501011')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
"""
# Default pad to end of range for compatibility
return time.strptime(pad_timestamp(string), TIMESTAMP_14)
# pad to 6 digits
string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)
val = max(min_, min(val, max_))
return val
except:
return max_
def extract(string, start, end, min_, max_):
if len(string) >= end:
return clamp(string[start:end], min_, max_)
else:
return max_
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string):
@ -104,7 +209,7 @@ def timestamp_to_sec(string):
1420070399
"""
return calendar.timegm(timestamp_to_datetime(string))
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
if __name__ == "__main__":


@ -56,9 +56,9 @@ class J2TemplateView:
# Filters
@staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value)
return time.strftime(format, value)
return value.strftime(format_)
@staticmethod
def get_host(url):


@ -6,8 +6,8 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -32,24 +32,12 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"]
@staticmethod
def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
if not loader:
loader = BlockLoader(cookie_maker)
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.loader = loader
self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -60,22 +48,25 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url)
#loader = self.loaders.get(url_parts.scheme)
#if not loader:
# raise ArchiveLoadFailed('Unknown Protocol', url)
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
# Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream)
(the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc':
rec_type = 'response'
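For reference, a hedged sketch of driving the new load path (the file URL and offset are hypothetical; the unpacking order follows the ArcWarcRecord constructor call below):
(fmt_rectype, rec_headers, stream, status_headers) = \
    ArcWarcRecordLoader().load('file:///path/to/example.warc.gz', offset=0, length=-1)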
@ -111,7 +102,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers)
def _load_headers(self, stream):
def _detect_type_load_headers(self, stream):
"""
Try parsing record as WARC, then try parsing as ARC.
If neither one succeeds, we're out of luck.

View File

@ -213,3 +213,6 @@ def load_from_cdx_test(cdx):
except Exception as e:
print 'Exception: ' + e.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,99 +1,75 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
#WB Request and Response
#=================================================================
class WbRequest:
"""
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
Represents the main pywb request object.
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
Contains various info from the wsgi env, plus additional info
about the request, such as coll, relative prefix,
host prefix, and absolute prefix.
>>> WbRequest.from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> WbRequest.from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
If wburl and url rewriter classes are specified, the request
also contains the url rewriter.
"""
@staticmethod
def from_uri(request_uri, env = {}, use_abs_prefix = False):
if not request_uri:
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
# Has coll prefix
if len(parts) == 3:
wb_prefix = '/' + parts[1] + '/'
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
wb_prefix = '/'
wb_url_str = parts[1]
coll = ''
else:
wb_prefix = '/'
wb_url_str = parts[0]
coll = ''
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
@staticmethod
def make_host_prefix(env):
try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
host = env.get('HTTP_HOST')
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env['wsgi.url_scheme'] + '://' + host
except KeyError:
return ''
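The fallback added here can be sketched with illustrative environ dicts:
>>> WbRequest.make_host_prefix({'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'})
'http://localhost:8080'
>>> WbRequest.make_host_prefix({'wsgi.url_scheme': 'http', 'SERVER_NAME': 'localhost', 'SERVER_PORT': '8080'})
'http://localhost:8080'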
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
host_prefix = '',
wburl_class = WbUrl,
url_rewriter_class = UrlRewriter,
is_proxy = False):
def __init__(self, env,
request_uri=None,
rel_prefix='',
wb_url_str='/',
coll='',
host_prefix='',
use_abs_prefix=False,
wburl_class=None,
urlrewriter_class=None,
is_proxy=False):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.host_prefix = host_prefix
self.coll = coll
if not host_prefix:
host_prefix = self.make_host_prefix(env)
self.host_prefix = host_prefix
self.rel_prefix = rel_prefix
if use_abs_prefix:
self.wb_prefix = host_prefix + rel_prefix
else:
self.wb_prefix = rel_prefix
self.wb_prefix = host_prefix + wb_prefix
if not wb_url_str:
wb_url_str = '/'
self.wb_url_str = wb_url_str
# wb_url present and not root page
if wb_url_str != '/' and wburl_class:
self.wb_url_str = wb_url_str
self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = url_rewriter_class(self.wb_url, self.wb_prefix)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
else:
# no wb_url, just store blank wb_url
self.wb_url_str = wb_url_str
self.wb_url = None
self.urlrewriter = None
self.coll = coll
self.referrer = env.get('HTTP_REFERER')
self.is_ajax = self._is_ajax()
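A minimal sketch of the new keyword-based construction (values illustrative; WbUrl and UrlRewriter per the imports at the top of this file):
env = {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080'}  # illustrative wsgi environ
req = WbRequest(env,
                request_uri='/pywb/20140127171238/http://example.com/',
                rel_prefix='/pywb/',
                wb_url_str='20140127171238/http://example.com/',
                coll='pywb',
                use_abs_prefix=False,
                wburl_class=WbUrl,
                urlrewriter_class=UrlRewriter)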
@ -122,24 +98,19 @@ class WbRequest:
def __repr__(self):
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
#return str(vars(self))
varlist = vars(self)
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
varstr = pprint.pformat(varlist)
return varstr
#=================================================================
class WbResponse:
"""
>>> WbResponse.text_response('Test')
{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
Represents a pywb wsgi response object.
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
Holds a status_headers object and a response iter, to be
returned to the wsgi container.
"""
def __init__(self, status_headers, value = []):
self.status_headers = status_headers
self.body = value
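A hedged usage sketch (StatusAndHeaders keyword names inferred from its repr above):
status = StatusAndHeaders(statusline='200 OK',
                          headers=[('Content-Type', 'text/plain')])
resp = WbResponse(status, value=['Test'])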
@ -180,8 +151,3 @@ class WbResponse:
def __repr__(self):
return str(vars(self))
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
@ -119,6 +124,20 @@ class TestWb:
assert resp.content_type == 'text/css'
def test_referrer_self_redirect(self):
uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
host = 'somehost:8082'
referrer = 'http://' + host + uri
# capture is normally a 200
resp = self.testapp.get(uri)
assert resp.status_int == 200
# redirect causes skip of this capture, redirect to next
resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
assert resp.status_int == 302
def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403