
pywb 0.2!

move to distinct packages: pywb.utils, pywb.cdx, pywb.warc, pywb.util, pywb.rewrite!
each package will have its own README and tests
shared sample_data and install
This commit is contained in:
Ilya Kreymer 2014-02-17 02:34:39 -08:00
parent 2528ee0a7c
commit 5345459298
61 changed files with 2951 additions and 2185 deletions
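
For example (an illustrative sketch using module paths that appear in this diff), imports that previously referenced top-level modules now use the new package layout:

from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.cdx.cdxserver import CDXServer
from pywb.utils.binsearch import iter_exact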


@ -1,2 +0,0 @@
#Allow importing


@ -1,3 +1,4 @@
#Allow importing
import os
def get_test_dir():
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'


@ -3,8 +3,8 @@ import re
import wbexceptions
from wbrequestresponse import WbRequest, WbResponse
from url_rewriter import UrlRewriter
from wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
@ -45,20 +45,6 @@ class ArchivalRouter:
# of request uri (excluding first '/')
#=================================================================
class Route:
"""
# route with relative path
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# route with absolute path, running at script /my_pywb
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# not matching route -- skipped
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
"""
# match up to next / or ? or end
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
@ -127,57 +113,6 @@ class Route:
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
#=================================================================
class ReferRedirect:
"""
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
# Right Host
>>> test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME + timestamp
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME, bad match
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False
"""
def __init__(self, match_prefixs):
if isinstance(match_prefixs, list):
self.match_prefixs = match_prefixs
@ -240,31 +175,3 @@ class ReferRedirect:
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
return WbResponse.redir_response(final_url)
import utils
if __name__ == "__main__" or utils.enable_doctests():
import handlers
def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, handlers.BaseHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, routes)
if not rep:
return False
return rep.status_headers.get_header('Location')
import doctest
doctest.testmod()


@ -1,461 +0,0 @@
import itertools
import utils
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions
from wbrequestresponse import StatusAndHeaders
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader:
"""
Load content over http with range request and optional signature
"""
def __init__(self, hmac = None, hmac_duration = 30):
self.hmac = hmac
self.hmac_duration = hmac_duration
def load(self, url, offset, length):
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = range_header
if self.hmac:
headers['Cookie'] = self.hmac(self.hmac_duration)
request = urllib2.Request(url, headers = headers)
return urllib2.urlopen(request)
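# Illustrative usage (a sketch, not part of the original file; assumes a
# server that honors HTTP Range requests and a hypothetical archive URL):
#
# loader = HttpLoader()
# stream = loader.load('http://example.com/archive.warc.gz', 0, 1024)
# data = stream.read()  # reads from the requested 1024-byte range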
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader:
"""
Load content from local file-system
# Ensure attempt to read more than 100 bytes, only reads 100 bytes
>>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read(400))
100
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
#=================================================================
# A reader which will not read past the specified limit
#=================================================================
class LimitReader:
"""
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> test_multiple_reads(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
"""
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit
if not self.limit:
self.limit = 1
def read(self, length = None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.read(length)
self.limit -= len(buff)
return buff
def readline(self, length = None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.readline(length)
self.limit -= len(buff)
return buff
def close(self):
self.stream.close()
#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')
#=================================================================
class ArchiveLoader:
"""
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
"""
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]
# Since we are loading via a range request, gzip-ness can only be determined by file extension
FORMAT_MAP = {
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
@staticmethod
def create_default_loaders(hmac = None):
http = HttpLoader(hmac)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders = {}, hmac = None, chunk_size = 8192):
self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise wbexceptions.UnknownLoaderProtocolException(url)
the_format = None
for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
if url.endswith(ext):
the_format = iformat
break
if the_format is None:
raise wbexceptions.UnknownArchiveFormatException(url)
(a_format, is_gzip) = the_format
decomp = utils.create_decompressor() if is_gzip else None
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
stream = LineReader(raw, length, self.chunk_size, decomp)
if a_format == 'arc':
rec_headers = self.arc_parser.parse(stream)
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
elif a_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif rec_type == 'metadata' or rec_type == 'resource':
status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
# if content_type and (';version=0.9' in content_type):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
#=================================================================
class StatusAndHeadersParser:
def __init__(self, statuslist):
self.statuslist = statuslist
def parse(self, stream):
statusline = stream.readline().rstrip()
protocol_status = utils.split_prefix(statusline, self.statuslist)
if not protocol_status:
raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
name, value = line.split(':', 1)
header = (name, value.strip())
headers.append(header)
line = stream.readline().rstrip()
return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])
#=================================================================
class ARCHeadersParser:
def __init__(self, headernames):
self.headernames = headernames
def parse(self, stream):
headerline = stream.readline().rstrip()
parts = headerline.split()
headernames = self.headernames
if len(parts) != len(headernames):
raise wbexceptions.InvalidArchiveRecordException('Wrong # of headers, expected arc headers {0}, found {1}'.format(headernames, parts))
headers = []
for name, value in itertools.izip(headernames, parts):
headers.append((name, value))
return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')
#=================================================================
class LineReader:
def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
self.stream = stream
self.chunk_size = chunk_size
self.decomp = decomp
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, chunk_size = None):
if not chunk_size:
chunk_size = self.chunk_size
if not self.buff or self.buff.pos >= self.buff.len:
to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
data = self.stream.read(to_read)
self._process_read(data)
def _process_read(self, data):
if self.decomp and data:
try:
data = self.decomp.decompress(data)
except Exception:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
self.decomp = False
# otherwise (partly decompressed), something is wrong
else:
raise
self.num_read += len(data)
self.buff = StringIO.StringIO(data)
def read(self, length = None):
self._fillbuff()
return self.buff.read(length)
def readline(self, length = None):
self._fillbuff()
return self.buff.readline(length)
def close(self):
if self.stream:
self.stream.close()
self.stream = None
class ChunkedDataException(Exception):
pass
class ChunkedLineReader(LineReader):
r"""
Properly formatted chunked data:
>>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read()
'1234'
Non-chunked data:
>>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read()
'xyz123!@#'
Starts like chunked data, but isn't:
>>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read()
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read()
'123412'
"""
all_chunks_read = False
not_chunked = False
raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors
def _fillbuff(self, chunk_size = None):
if self.not_chunked:
return LineReader._fillbuff(self, chunk_size)
if self.all_chunks_read:
return
if not self.buff or self.buff.pos >= self.buff.len:
length_header = self.stream.readline(64)
data = ''
try:
# decode length header
try:
chunk_size = int(length_header.strip().split(';')[0], 16)
except ValueError:
raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)
if chunk_size:
# read chunk
while len(data) < chunk_size:
new_data = self.stream.read(chunk_size - len(data))
# if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
if not new_data:
if self.raise_chunked_data_exceptions:
raise ChunkedDataException("Ran out of data before end of chunk")
else:
chunk_size = len(data)
self.all_chunks_read = True
data += new_data
# if we successfully read a block without running out, it should end in \r\n
if not self.all_chunks_read:
crlf = self.stream.read(2)
if crlf != '\r\n':
raise ChunkedDataException("Chunk terminator not found.")
if self.decomp:
data = self.decomp.decompress(data)
else:
# chunk_size 0 indicates end of file
self.all_chunks_read = True
data = ''
self._process_read(data)
except ChunkedDataException:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is sent with a Transfer-Encoding: chunked header.
# Treat this as non-chunked from here on
self._process_read(length_header + data)
self.not_chunked = True
#=================================================================
import utils
if __name__ == "__main__" or utils.enable_doctests():
import os
import pprint
testloader = ArchiveLoader()
def load_test_archive(test_file, offset, length):
path = utils.test_data_dir() + 'warcs/' + test_file
archive = testloader.load(path, offset, length)
pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
def test_multiple_reads(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
import doctest
doctest.testmod()


@ -1,123 +0,0 @@
from collections import deque
import os
import itertools
#=================================================================
# Binary Search over a text file
#=================================================================
class FileReader:
"""
A very simple file-like object wrapper that knows its size
the getsize() method returns the file size
"""
def __init__(self, filename):
self.fh = open(filename, 'rb')
self.filename = filename
self.size = os.path.getsize(filename)
def getsize(self):
return self.size
def readline(self):
return self.fh.readline()
def seek(self, offset):
return self.fh.seek(offset)
def close(self):
return self.fh.close()
#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
"""
Find offset of the full line which matches a given 'key' using binary search
If key is not found, the offset is of the line after the key
File is subdivided into block_size (default 8192) sized blocks
Optional compare_func may be specified
"""
min = 0
max = reader.getsize() / block_size
while (max - min > 1):
mid = min + ((max - min) / 2)
reader.seek(mid * block_size)
if mid > 0:
reader.readline() # skip partial line
line = reader.readline()
if compare_func(key, line) > 0:
min = mid
else:
max = mid
return (min * block_size)
def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
"""
Perform a binsearch for a specified key down to block_size (8192) sized blocks,
followed by linear search within the block to find first matching line.
When performing linear search, keep track of up to N previous lines before
first matching line.
"""
min = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min)
if min > 0:
reader.readline() # skip partial line
if prev_size > 1:
prev_deque = deque(maxlen = prev_size)
line = None
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
# Iterate over prefix matches
def iter_prefix(reader, key):
"""
Creates an iterator which iterates over prefix matches for a key in a sorted text file
A line matches as long as it starts with key
"""
return itertools.takewhile(lambda line: line.startswith(key), search(reader, key))
def iter_exact(reader, key, token=' '):
"""
Create an iterator which iterates over exact matches for a key in a sorted text file
Key is terminated by a token (default ' ')
"""
return iter_prefix(reader, key + token)
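# Illustrative usage (a sketch, not part of the original file; assumes a
# sorted, space-delimited index file at a hypothetical path):
#
# reader = FileReader('/path/to/sorted.cdx')
# for line in iter_exact(reader, 'org,iana)/'):
#     print line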

pywb/cdx/README.md Normal file (+36)

@ -0,0 +1,36 @@
## PyWb CDX v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_cdx.png?branch=master)](https://travis-ci.org/ikreymer/pywb_cdx)
This package contains the CDX processing components of the pywb wayback tool suite.
The CDX Server loads, filters and transforms cdx from multiple sources in response
to a given query.
### Installation and Tests
`pip install -r requirements.txt` -- to install
`python run-tests.py` -- to run all tests
### Sample App
A very simple reference WSGI app is included.
Run: `python -m pywb_cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop.
The default [config.yaml](pywb_cdx/config.yaml) points to the sample data directory
and uses port 8090
### CDX Server API Reference
The goal is to provide compatibility with this feature set and more:
https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
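
A minimal query against a locally running instance might look like this (an illustrative sketch; the `url=` param is required, and `limit` is optional):

```
import urllib2
print urllib2.urlopen('http://localhost:8090/?url=example.com&limit=1').read()
```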
TODO


@ -1,25 +1,31 @@
from collections import OrderedDict
import itertools
#=================================================================
class CDXObject(OrderedDict):
CDX_FORMATS = [
# Public CDX Format
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "length"],
# CDX 11 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename"],
# CDX 9 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename"],
# CDX 11 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
"orig.length","orig.offset","orig.filename"],
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"],
# CDX 9 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
"orig.length","orig.offset","orig.filename"]
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"]
]
def __init__(self, cdxline):
@ -53,5 +59,3 @@ class CDXObject(OrderedDict):
li = itertools.imap(lambda (n, val): val, self.items())
return ' '.join(li)
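# Illustrative usage (a sketch, not part of the original file; the sample
# line follows the 7-field public CDX format above):
#
# line = ('com,example)/ 20140127171200 http://example.com '
#         'text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A 1046')
# cdx = CDXObject(line)
# print cdx['timestamp']  # '20140127171200'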


@ -1,8 +1,6 @@
from cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_to_sec
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
import timeutils
import bisect
import itertools
import re
@ -11,7 +9,6 @@ from heapq import merge
from collections import deque
#=================================================================
def cdx_text_out(cdx, fields):
if not fields:
@ -26,30 +23,31 @@ def cdx_load(sources, params):
cdx_iter = make_cdx_iter(cdx_iter)
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
if not params.get('proxy_all'):
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000))
limit = int(params.get('limit', 1000000))
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
closest_to = params.get('closest_to', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
closest_to = params.get('closest', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
# output raw cdx objects
if params.get('output') == 'raw':
@ -73,6 +71,7 @@ def load_cdx_streams(sources, params):
merged_stream = merge(*(source_iters))
return merged_stream
#=================================================================
# convert text cdx stream to CDXObject
def make_cdx_iter(text_iter):
@ -98,7 +97,7 @@ def cdx_reverse(cdx_iter, limit):
return [last] if last else []
reverse_cdxs = deque(maxlen = limit)
reverse_cdxs = deque(maxlen=limit)
for cdx in cdx_iter:
reverse_cdxs.appendleft(cdx)
@ -142,14 +141,13 @@ def cdx_filter(cdx_iter, filter_strings):
filters = map(Filter, filter_strings)
for cdx in cdx_iter:
if all (x(cdx) for x in filters):
if all(x(cdx) for x in filters):
yield cdx
#=================================================================
# collapse by timestamp and status code
def cdx_collapse_time_status(cdx_iter, timelen = 10):
def cdx_collapse_time_status(cdx_iter, timelen=10):
timelen = int(timelen)
last_token = None
@ -163,16 +161,15 @@ def cdx_collapse_time_status(cdx_iter, timelen = 10):
yield cdx
#=================================================================
# sort CDXCaptureResult by closest to timestamp
def cdx_sort_closest(closest, cdx_iter, limit = 10):
def cdx_sort_closest(closest, cdx_iter, limit=10):
closest_cdx = []
closest_sec = timeutils.timestamp_to_sec(closest)
closest_sec = timestamp_to_sec(closest)
for cdx in cdx_iter:
sec = timeutils.timestamp_to_sec(cdx['timestamp'])
sec = timestamp_to_sec(cdx['timestamp'])
key = abs(closest_sec - sec)
# create tuple to sort by key
@ -186,22 +183,22 @@ def cdx_sort_closest(closest, cdx_iter, limit = 10):
if len(closest_cdx) > limit:
closest_cdx.pop()
return itertools.imap(lambda x: x[1], closest_cdx)
#=================================================================
# resolve revisits
# Fields to append from cdx original to revisit
ORIG_TUPLE = ['length', 'offset', 'filename']
def cdx_resolve_revisits(cdx_iter):
originals = {}
for cdx in cdx_iter:
is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
(cdx['filename'] == '-'))
digest = cdx['digest']
@ -210,7 +207,6 @@ def cdx_resolve_revisits(cdx_iter):
if not original_cdx and not is_revisit:
originals[digest] = cdx
if original_cdx and is_revisit:
fill_orig = lambda field: original_cdx[field]
# Transfer mimetype and statuscode
@ -224,5 +220,3 @@ def cdx_resolve_revisits(cdx_iter):
cdx['orig.' + field] = fill_orig(field)
yield cdx
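# Illustrative use of the pipeline above (a sketch, not part of the original
# file; assumes a local .cdx file wrapped in a CDXFile source; 'key' is
# normally set by CDXServer via surt canonicalization):
#
# params = {'key': 'org,iana)/', 'filter': ['statuscode:200'],
#           'closest': '20140126200826', 'limit': 10, 'output': 'text'}
# for line in cdx_load([CDXFile('/path/to/iana.cdx')], params):
#     sys.stdout.write(line)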


@ -1,5 +1,4 @@
import surt
from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
from cdxops import cdx_load
import itertools
@ -7,39 +6,21 @@ import logging
import os
import urlparse
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
#=================================================================
class CDXFile:
def __init__(self, filename):
self.filename = filename
def load_cdx(self, params):
source = FileReader(self.filename)
match_type = params.get('match_type')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self):
return 'CDX File - ' + self.filename
#=================================================================
class CDXException(Exception):
def __init__(self, msg, url = None):
Exception.__init__(self, msg)
self.url = url
def status(self):
return '400 Bad Request'
#=================================================================
class AccessException(CDXException):
def status(self):
return '403 Forbidden'
#=================================================================
class CDXServer:
"""
@ -47,33 +28,51 @@ class CDXServer:
responds to queries and dispatches to the cdx ops for processing
"""
def __init__(self, sources, surt_ordered = True):
@staticmethod
def create_from_config(config):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
return CDXServer(paths, surt_ordered)
def __init__(self, sources, surt_ordered=True):
self.sources = []
self.surt_ordered = surt_ordered
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if not isinstance(sources, list):
sources = [sources]
for src in sources:
if os.path.isdir(src):
for file in os.listdir(src):
self.add_cdx_loader(src + file)
else:
self.add_cdx_loader(src)
if isinstance(src, CDXSource):
self.add_cdx_source(src)
elif isinstance(src, str):
if os.path.isdir(src):
for file in os.listdir(src):
self.add_cdx_source(src + file)
else:
self.add_cdx_source(src)
if len(self.sources) == 0:
logging.exception('No CDX Sources Found!')
logging.exception('No CDX Sources Found from: ' + str(sources))
def add_cdx_loader(self, filename):
source = self.create_cdx_loader(filename)
if not source:
return
def add_cdx_source(self, source):
if not isinstance(source, CDXSource):
source = self.create_cdx_source(source)
if not source:
return
logging.debug('Adding CDX Source: ' + str(source))
self.sources.append(source)
@staticmethod
def create_cdx_loader(filename):
def create_cdx_source(filename):
if filename.startswith('http://') or filename.startswith('https://'):
return RemoteCDXSource(filename)
if filename.endswith('.cdx'):
return CDXFile(filename)
return None
#TODO: support zipnum
#elif filename.endswith('.summary')
@ -81,27 +80,52 @@ class CDXServer:
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
def load_cdx(self, **params):
# canonicalize to surt (canonicalization is part of surt conversion)
# if key not set, assume 'url' is set and needs canonicalization
if not params.get('key'):
params['key'] = self._canonicalize(params)
self._convert_old_style(params)
return cdx_load(self.sources, params)
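# Illustrative usage (a sketch; mirrors the doctests elsewhere in this
# commit, with a hypothetical index path):
#
# server = CDXServer.create_from_config({'index_paths': './sample_data/'})
# for line in server.load_cdx(url='example.com', limit=1, output='text'):
#     sys.stdout.write(line)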
def _canonicalize(self, params):
"""
Canonicalize url and convert to surt
If not in surt-ordered mode, convert back to url form,
as surt conversion is currently part of canonicalization
"""
try:
url = params['url']
except KeyError:
raise CDXException('The url= param must be specified to query the cdx server')
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
try:
key = surt.surt(url)
except Exception as e:
raise CDXException('Invalid url: ', url)
raise CDXException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url
if not self.surt_ordered:
key = unsurt(key)
params['key'] = key
return key
return cdx_load(self.sources, params)
def _convert_old_style(self, params):
"""
Convert old-style CDX Server param semantics
"""
collapse_time = params.get('collapseTime')
if collapse_time:
params['collapse_time'] = collapse_time
resolve_revisits = params.get('resolveRevisits')
if resolve_revisits:
params['resolve_revisits'] = resolve_revisits
if params.get('sort') == 'reverse':
params['reverse'] = True
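# Illustrative effect (assumed input, shown as a comment only):
# {'collapseTime': '10', 'resolveRevisits': True, 'sort': 'reverse'}
# gains the new-style keys after _convert_old_style():
# {'collapseTime': '10', 'collapse_time': '10',
#  'resolveRevisits': True, 'resolve_revisits': True,
#  'sort': 'reverse', 'reverse': True}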
def load_cdx_from_request(self, env):
#url = wbrequest.wb_url.url
@ -113,7 +137,8 @@ class CDXServer:
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdxreader expects singleton params for all except filters, so convert here
# cdx processing expects singleton params for all params,
# except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
if name != 'filter':
@ -122,13 +147,10 @@ class CDXServer:
cdx_lines = self.load_cdx(**params)
return cdx_lines
def __str__(self):
return 'load cdx indexes from ' + str(self.sources)
#=================================================================
def unsurt(surt):
"""
@ -141,7 +163,8 @@ def unsurt(surt):
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
"""
@ -158,3 +181,6 @@ def unsurt(surt):
return surt
if __name__ == "__main__":
import doctest
doctest.testmod()

pywb/cdx/cdxsource.py Normal file (+92)

@ -0,0 +1,92 @@
from pywb.utils.binsearch import iter_exact, iter_prefix
from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
#=================================================================
class CDXSource(object):
"""
Represents any cdx index source
"""
def load_cdx(self, params):
raise NotImplementedError('Implement in subclass')
#=================================================================
class CDXFile(CDXSource):
"""
Represents a local plain-text .cdx file
"""
def __init__(self, filename):
self.filename = filename
def load_cdx(self, params):
source = SeekableTextFileReader(self.filename)
match_type = params.get('match_type')
if match_type == 'prefix':
iter_func = iter_prefix
else:
iter_func = iter_exact
key = params.get('key')
return iter_func(source, key)
def __str__(self):
return 'CDX File - ' + self.filename
#=================================================================
class RemoteCDXSource(CDXSource):
"""
Represents a remote cdx server, to which requests will be proxied.
Only url and match type params are proxied at this time;
the response stream is passed through all other filters locally.
"""
def __init__(self, filename, cookie=None, proxy_all=True):
self.remote_url = filename
self.cookie = cookie
self.proxy_all = proxy_all
def load_cdx(self, proxy_params):
if self.proxy_all:
params = proxy_params
params['proxy_all'] = True
else:
# Only send url and matchType params to remote
params = {}
params['url'] = proxy_params['url']
match_type = proxy_params.get('match_type')
if match_type:
params['matchType'] = match_type
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.remote_url, urlparams)
if self.cookie:
request.add_header('Cookie', self.cookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError as e:
if e.code == 403:
exc_msg = e.read()
msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
else 'Excluded')
raise AccessException(msg)
else:
raise
return iter(response)
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
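# Illustrative usage (a sketch, not part of the original file; requires
# network access to the public endpoint used in tests elsewhere in this
# commit):
#
# source = RemoteCDXSource('http://web.archive.org/cdx/search/cdx')
# for line in source.load_cdx({'url': 'example.com', 'limit': '2'}):
#     print line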

pywb/cdx/config.yaml Normal file (+3)

@ -0,0 +1,3 @@
#CDX Server WSGI App Config
index_paths: ./sample_data/
port: 8090


@ -0,0 +1,163 @@
#=================================================================
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
"""
#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import pprint
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'text'
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
sys.stdout.write(x)
if __name__ == "__main__":
import doctest
doctest.testmod()


@ -0,0 +1,72 @@
from cdxserver import CDXServer
import logging
import os
import yaml
import pkgutil
#=================================================================
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
CONFIG_FILE = 'config.yaml'
DEFAULT_PORT = 8080
if __package__:
config = pkgutil.get_data(__package__, CONFIG_FILE)
config = yaml.load(config)
else:
config = None
#=================================================================
def main():
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
cdx_config = config.get('index_paths') if config else None
if not cdx_config:
cdx_config = [TEST_CDX_DIR]
cdxserver = CDXServer(cdx_config)
def application(env, start_response):
try:
response = cdxserver.load_cdx_from_request(env)
start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response)
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
start_response('400 Error', [('Content-Type', 'text/plain')])
response = [str(exc)]
print err_details
return response
return application
if __name__ == "__main__":
from wsgiref.simple_server import make_server
app = main()
port = DEFAULT_PORT
if config:
port = config.get('port', DEFAULT_PORT)
httpd = make_server('', port, app)
logging.debug('Starting CDX Server on port ' + str(port))
try:
httpd.serve_forever()
except KeyboardInterrupt:
pass
logging.debug('Stopping CDX Server')
else:
application = main()


@ -1,42 +0,0 @@
from cdxserver import CDXServer
import logging
import os
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'
#=================================================================
def main(config = None):
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)
if not config:
config = [test_cdx_dir]
cdxserver = CDXServer(config)
def application(env, start_response):
try:
response = cdxserver.load_cdx_from_request(env)
start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response)
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
start_response('400 Error', [('Content-Type', 'text/plain')])
response = [str(exc)]
print err_details
return response
return application
if __name__ == "__main__":
pass
else:
application = main()


@ -1,59 +1,34 @@
import archiveloader
import views
import handlers
import indexreader
import replay_views
import replay_resolvers
import logging
import hmac
import time
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
# Config Loading
#=================================================================
def load_template_file(file, desc = None, view_class = views.J2TemplateView):
if file:
logging.info('Adding {0}: {1}'.format(desc if desc else name, file))
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
return file
#=================================================================
# Cookie Signing
#=================================================================
def create_wb_handler(cdx_server, config):
class HMACCookieMaker:
def __init__(self, key, name):
self.key = key
self.name = name
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
paths = config.get('archive_paths')
def __call__(self, duration, extra_id = ''):
expire = str(long(time.time() + duration))
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
if extra_id:
msg = extra_id + '-' + expire
else:
msg = expire
replayer = replay_views.ReplayView(
content_loader = resolving_loader,
hmacdigest = hmac.new(self.key, msg)
hexdigest = hmacdigest.hexdigest()
if extra_id:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
else:
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
return cookie
#=================================================================
def create_wb_handler(cdx_source, config):
replayer = replay_views.RewritingReplayView(
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
content_rewriter = RewriteContent(),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
@ -66,7 +41,7 @@ def create_wb_handler(cdx_source, config):
wb_handler = handlers.WBHandler(
cdx_source,
cdx_server,
replayer,


@ -1,13 +1,12 @@
import views
import utils
import urlparse
from wbrequestresponse import WbResponse
from wburl import WbUrl
from wbexceptions import WbException, NotFoundException
import pkgutil
import mimetypes
import time
from pywb.rewrite.wburl import WbUrl
from wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException
from views import TextCapturesView
class BaseHandler:
@ -22,23 +21,22 @@ class BaseHandler:
# Standard WB Handler
#=================================================================
class WBHandler(BaseHandler):
def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
self.cdx_reader = cdx_reader
def __init__(self, index_reader, replay, html_view = None, search_view = None):
self.index_reader = index_reader
self.replay = replay
self.text_view = views.TextCapturesView()
self.text_view = TextCapturesView()
self.html_view = html_view
self.search_view = search_view
def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_search_page(wbrequest)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.index_reader.load_for_request(wbrequest)
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
@ -48,8 +46,8 @@ class WBHandler(BaseHandler):
query_view = self.html_view if self.html_view else self.text_view
return query_view.render_response(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines)
def render_search_page(self, wbrequest):
@ -60,18 +58,18 @@ class WBHandler(BaseHandler):
def __str__(self):
return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler(BaseHandler):
def __init__(self, cdx_server, view = None):
self.cdx_server = cdx_server
self.view = view if view else views.TextCapturesView()
def __init__(self, index_reader, view = None):
self.index_reader = index_reader
self.view = view if view else TextCapturesView()
def __call__(self, wbrequest):
cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)
return self.view.render_response(wbrequest, cdx_lines)
@ -81,7 +79,7 @@ class CDXHandler(BaseHandler):
return None
def __str__(self):
return 'CDX Server: ' + str(self.cdx_server)
return 'Index Reader: ' + str(self.index_reader)
#=================================================================
@ -136,4 +134,19 @@ class DebugEchoHandler(BaseHandler):
return WbResponse.text_response(str(wbrequest))
#=================================================================
class PerfTimer:
def __init__(self, perfdict, name):
self.perfdict = perfdict
self.name = name
def __enter__(self):
self.start = time.clock()
return self
def __exit__(self, *args):
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)
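# Illustrative usage (a sketch; mirrors the PerfTimer calls in WBHandler
# above, with a hypothetical timed block):
#
# perf = {}
# with PerfTimer(perf, 'query'):
#     pass  # ... timed work here ...
# print perf.get('query')  # elapsed time as a string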


@ -1,17 +1,22 @@
import urllib
import urllib2
import wbexceptions
import wbrequestresponse
from collections import OrderedDict
from cdxserver.cdxserver import CDXServer, CDXException
from cdxserver.cdxobject import CDXObject
from itertools import chain
from pprint import pprint
import logging
from pywb.cdx.cdxserver import CDXServer, CDXException
from pywb.cdx.cdxobject import CDXObject
#=================================================================
class IndexReader:
def load_for_request(self, wbrequest, parsed_cdx = True):
class IndexReader(object):
def __init__(self, config):
if isinstance(config, str):
self.cdx_server = CDXServer(config)
else:
self.cdx_server = CDXServer.create_from_config(config)
def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url
# init standard params
@ -24,147 +29,27 @@ class IndexReader:
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
#params['url'] = wburl.url
output = 'raw' if parsed_cdx else 'text'
params['url'] = wburl.url
try:
cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
cdxlines = self.load_cdx(output='raw', **params)
except CDXException:
raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)
cdxlines = utils.peek_iter(cdxlines)
cdxlines = self.peek_iter(cdxlines)
if cdxlines is None:
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
cdxlines = self.filter_cdx(wbrequest, cdxlines)
return cdxlines
def filter_cdx(self, wbrequest, cdxlines):
# Subclasses may wrap cdxlines iterator in a filter
return cdxlines
def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params)
def load_cdx(self, url, params = {}, parsed_cdx = True):
raise NotImplementedError('Override in subclasses')
@staticmethod
def make_best_cdx_source(paths, config):
# may be a string or list
surt_ordered = config.get('surt_ordered', True)
# support mixed cdx streams and remote servers?
# for now, list implies local sources
if isinstance(paths, list):
if len(paths) > 1:
return EmbeddedCDXServer(paths, surt_ordered)
else:
# treat as non-list
paths = paths[0]
# a single uri
uri = paths
# Check for remote cdx server
if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'):
cookie = config.get('cookie', None)
return RemoteCDXServer(uri, cookie = cookie)
else:
return EmbeddedCDXServer([uri], surt_ordered)
#=================================================================
class EmbeddedCDXServer(CDXServer, IndexReader):
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')
return {
wburl.QUERY:
{'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
wburl.URL_QUERY:
{},
# raise Exception('Not Yet Implemented')
# {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
# 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
# },
wburl.REPLAY:
{'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True},
wburl.LATEST_REPLAY:
{'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True}
}[wburl.type]
def __str__(self):
return 'load cdx indexes from ' + str(self.sources)
#=================================================================
class RemoteCDXServer(IndexReader):
"""
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
>>> pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
"""
def __init__(self, server_url, cookie = None):
self.server_url = server_url
self.auth_cookie = cookie
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
#url is required, must be passed explicitly!
params['url'] = url
params.update(**kwvalues)
urlparams = urllib.urlencode(params, True)
try:
request = urllib2.Request(self.server_url, urlparams)
if self.auth_cookie:
request.add_header('Cookie', self.auth_cookie)
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
if e.code == 403:
exc_msg = e.read()
msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
raise wbexceptions.AccessException(msg)
else:
raise
if parsed_cdx:
return (CDXObject(cdx) for cdx in response)
else:
return iter(response)
# Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
# Soon, this will be switched over to support the native pywb cdx server
# BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns wrong results
# with lower values if there are too many captures. Ideally, it should be around 10-20
# replayClosest is the max number of cdx lines, and thus the max number of retry attempts that WB will make
def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
return {
wburl.QUERY:
{'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
@ -184,18 +69,20 @@ class RemoteCDXServer(IndexReader):
}[wburl.type]
@staticmethod
def peek_iter(iterable):
try:
first = next(iterable)
except StopIteration:
return None
def __str__(self):
return 'server cdx from ' + self.server_url
return chain([first], iterable)
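A small sketch of the peek_iter contract used in load_for_request above: it returns None for an empty iterator, and otherwise an equivalent iterator with the first item re-chained (class placement assumed from the surrounding diff):
assert IndexReader.peek_iter(iter([])) is None
assert list(IndexReader.peek_iter(iter(['a', 'b']))) == ['a', 'b']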
#=================================================================
class RemoteCDXServer(IndexReader):
def __init__(self, remote_url, cookie=None):
self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
self.cdx_server = CDXServer(self.remote)
# Testing
import utils
if __name__ == "__main__" or utils.enable_doctests():
from pprint import pprint
test_dir = utils.test_data_dir() + 'cdx/'
import doctest
doctest.testmod()
#def load_cdx(self, **params):
#return remote.load_cdx(**params)
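As a sketch of how these readers are constructed (the local cdx path below is hypothetical):
# a string config creates a CDXServer directly from a local cdx path
local = IndexReader('/path/to/index.cdx')
# a remote cdx server, with all queries proxied to it
remote = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')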

View File

@ -1,11 +1,12 @@
import handlers
import indexreader
import archivalrouter
import config_utils
import proxy
import os
import yaml
import config_utils
import logging
import proxy
#=================================================================
DEFAULTS = {
@ -49,24 +50,20 @@ def pywb_config_manual(passed_config = {}):
collections = config.get('collections')
for name, value in collections.iteritems():
route_config = config
if isinstance(value, dict):
# if a dict, extend with base properties
index_paths = value['index_paths']
route_config = DictChain(value, config)
if isinstance(value, str):
route_config = config
cdx_server = indexreader.IndexReader(value)
else:
index_paths = str(value)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
route_config = DictChain(value, config)
cdx_server = indexreader.IndexReader(route_config)
wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source,
cdx_server = cdx_server,
config = route_config,
)
logging.info('Adding Collection: ' + name)
logging.debug('Adding Collection: ' + name)
route_class = route_config.get('route_class', archivalrouter.Route)
@ -74,7 +71,7 @@ def pywb_config_manual(passed_config = {}):
# cdx query handler
if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server)))
if config.get('debug_echo_env', False):
@ -125,11 +122,3 @@ def pywb_config(config_file = None):
return pywb_config_manual(config)
import utils
if __name__ == "__main__" or utils.enable_doctests():
# Just test for execution for now
#pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml')
pywb_config_manual()

View File

@ -1,269 +0,0 @@
import re
import sys
import itertools
from url_rewriter import UrlRewriter
#=================================================================
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@staticmethod
def comment_out(string):
return '/*' + string + '*/'
@staticmethod
def remove_https(string):
return string.replace("https", "http")
@staticmethod
def add_prefix(prefix):
return lambda string: prefix + string
@staticmethod
def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
def replacer(string):
return lambda x: string
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group
regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regex_str, re.M)
self.rules = rules
def filter(self, m):
return True
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules:
i += 1
full_m = i
while count > 0:
i += 1
count -= 1
if not m.group(i):
continue
# Optional filter to skip matches
if not self.filter(m):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i))
# if extracting partial match
if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
return result
#=================================================================
class JSRewriter(RegexRewriter):
"""
>>> test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
>>> test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
>>> test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'
>>> test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
rules.extend(extra)
RegexRewriter.__init__(self, rules)
def _create_rules(self, http_prefix):
return [
(self.JS_HTTPX, http_prefix, 0),
(r'(?<!/)\blocation\b', 'WB_wombat_', 0),
(r'(?<=document\.)domain', 'WB_wombat_', 0),
]
#=================================================================
class XMLRewriter(RegexRewriter):
"""
>>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
>>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
>>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
>>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
"""
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
attr = m.group(1)
if attr and attr.startswith('xmlns'):
return False
return True
def _create_rules(self, http_prefix):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
]
#=================================================================
class CSSRewriter(RegexRewriter):
r"""
>>> test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"
>>> test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"
>>> test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'
>>> test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
>>> test_css("background: url('')")
"background: url('')"
>>> test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
>>> test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"
>>> test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'
>>> test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'
>>> test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'
>>> test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'
>>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
"""
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
RegexRewriter.__init__(self, rules)
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
]
import utils
if __name__ == "__main__" or utils.enable_doctests():
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)
def test_xml(string):
return XMLRewriter(arcrw).rewrite(string)
def test_css(string):
return CSSRewriter(arcrw).rewrite(string)
import doctest
doctest.testmod()

View File

@ -1,30 +1,30 @@
import StringIO
from urllib2 import URLError
import chardet
import copy
import itertools
import archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
import utils
from url_rewriter import UrlRewriter
from header_rewriter import HeaderRewriter
import html_rewriter
import regex_rewriters
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.bufferedreaders import ChunkedDataReader
from wbrequestresponse import WbResponse
import wbexceptions
#=================================================================
class ReplayView:
def __init__(self, resolvers, loader = None, reporter = None):
self.resolvers = resolvers
self.loader = loader if loader else archiveloader.ArchiveLoader()
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
redir_to_exact = True, buffer_response = False, reporter = None):
self.content_loader = content_loader
self.content_rewriter = content_rewriter
self.head_insert_view = head_insert_view
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
self._reporter = reporter
def __call__(self, wbrequest, cdx_lines, cdx_reader):
def __call__(self, wbrequest, cdx_lines):
last_e = None
first = True
@ -40,9 +40,22 @@ class ReplayView:
self._redirect_if_needed(wbrequest, cdx)
first = False
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
(status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)
response = self.make_response(wbrequest, cdx, status_headers, stream)
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers, stream)
response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter)
# notify reporter callback, if any
if self._reporter:
@ -62,288 +75,57 @@ class ReplayView:
else:
raise wbexceptions.UnresolvedArchiveFileException()
# callback to issue a redirect to another request
# subclasses may provide custom logic
def _redirect_if_needed(self, wbrequest, cdx):
pass
def _load(self, cdx, revisit, failed_files):
if revisit:
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
#optimization: if same file already failed this request, don't try again
if failed_files and filename in failed_files:
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
any_found = False
last_exc = None
for resolver in self.resolvers:
possible_paths = resolver(filename)
if possible_paths:
for path in possible_paths:
any_found = True
try:
return self.loader.load(path, offset, length)
except Exception as ue:
last_exc = ue
print last_exc
pass
# Unsuccessful if reached here
if failed_files:
failed_files.append(filename)
if not any_found:
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
else:
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files):
has_curr = (cdx['filename'] != '-')
has_orig = (cdx.get('orig.filename','-') != '-')
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = self._load(cdx, False, failed_files) if has_curr else None
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)
# single lookup cases
# case 2: non-revisit
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (has_orig):
payload_record = self._load(cdx, True, failed_files)
# special case: set header to payload if old-style revisit with missing header
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for (already parsed) headers
headers_record.stream.close()
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
#response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
#response._stream = payload_record.stream
return (cdx, headers_record.status_headers, payload_record.stream)
# done here! just return response
# subclasses make override to do additional processing
def make_response(self, wbrequest, cdx, status_headers, stream):
return self.create_stream_response(status_headers, stream)
# create response from headers and wrapping stream in generator
def create_stream_response(self, status_headers, stream):
return WbResponse(status_headers, self.create_stream_gen(stream))
# Handle the case where a duplicate of a capture with same digest exists at a different url
# Must query the index at that url filtering by matching digest
# Raise exception if no matches found
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')
if not ref_target_date:
ref_target_date = cdx['timestamp']
else:
ref_target_date = utils.iso_date_to_timestamp(ref_target_date)
# clone WbRequest
orig_wbreq = copy.copy(wbrequest)
orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)
orig_wbreq.wb_url.url = ref_target_uri
orig_wbreq.wb_url.timestamp = ref_target_date
# Must also match digest
orig_wbreq.query_filter.append('digest:' + cdx['digest'])
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
for cdx in orig_cdx_lines:
try:
#cdx = cdx_reader.CDXCaptureResult(cdx)
#print cdx
payload_record = self._load(cdx, False, failed_files)
return payload_record
except wbexceptions.CaptureException as e:
pass
raise wbexceptions.CaptureException('Original for revisit could not be loaded')
def resolve_full(self, filename):
# Attempt to resolve cdx file to full path
full_url = None
for resolver in self.resolvers:
full_url = resolver(filename)
if full_url:
return full_url
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
# Create a generator reading from a stream, with optional rewriting and final read call
@staticmethod
def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
def stream_to_iter(stream):
try:
buff = first_buff if first_buff else stream.read()
buff = stream.read()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# For adding a tail/handling final buffer
if final_read_func:
buff = final_read_func()
if buff:
yield buff
finally:
stream.close()
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
if (status_headers.remove_header('transfer-encoding')):
stream = ChunkedDataReader(stream)
def __str__(self):
return 'find archive files from ' + str(self.resolvers)
#=================================================================
class RewritingReplayView(ReplayView):
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
ReplayView.__init__(self, resolvers, loader, reporter)
self.head_insert_view = head_insert_view
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
def _text_content_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in content_type) for mime in mimelist):
return ctype
return None
def make_response(self, wbrequest, cdx, status_headers, stream):
# check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers)
# check if redir is needed
self._redirect_if_needed(wbrequest, cdx)
return (status_headers, stream)
def rewrite_content(self, wbrequest, cdx, status_headers, stream):
urlrewriter = wbrequest.urlrewriter
rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter)
(rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
# de_chunking in case chunk encoding is broken
# TODO: investigate further
de_chunk = False
# handle transfer-encoding: chunked
if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
de_chunk = True
# transparent, though still may need to dechunk
if wbrequest.wb_url.mod == 'id_':
if de_chunk:
status_headers.remove_header('transfer-encoding')
return self.create_stream_response(status_headers, stream)
# non-text content type, just send through with rewritten headers
# but may need to dechunk
# no rewriting needed!
if rewritten_headers.text_type is None:
status_headers = rewritten_headers.status_headers
response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter)
return self.create_stream_response(status_headers, stream)
# Handle text rewriting
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right?
if rewritten_headers.charset:
encoding = rewritten_headers.charset
first_buff = None
# do head insert
if self.head_insert_view:
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
else:
(encoding, first_buff) = self._detect_charset(stream)
head_insert_str = None
# if chardet thinks it's ascii, use utf-8
if encoding == 'ascii':
#encoding = None
encoding = 'utf-8'
# Buffering response for html, streaming for others?
#if rewritten_headers.text_type == 'html':
# return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
#else:
# return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
text_type = rewritten_headers.text_type
status_headers = rewritten_headers.status_headers
if text_type == 'html':
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif text_type == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif text_type == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
else:
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
# Create generator for response
response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)
(status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
if self.buffer_response:
return self._create_buffer_response(status_headers, response_gen)
else:
return WbResponse(status_headers, value = response_gen)
if wbrequest.wb_url.mod == 'id_':
status_headers.remove_header('content-length')
return self.buffered_response(status_headers, response_gen)
return WbResponse(status_headers, response_gen)
# Buffer rewrite generator and return a response from a string
def _create_buffer_response(self, status_headers, generator):
# Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator):
out = StringIO.StringIO()
try:
for buff in generator:
for buff in iterator:
out.write(buff)
finally:
@ -355,53 +137,9 @@ class RewritingReplayView(ReplayView):
return WbResponse(status_headers, value = [content])
# Create rewrite response from record (no Content-Length), may even be chunked by front-end
def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
def do_rewrite(buff):
if encoding:
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
if encoding:
buff = buff.encode(encoding)
return buff
def do_finish():
return rewriter.close()
return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decode_buff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
def _detect_charset(self, stream):
buff = stream.read(8192)
result = chardet.detect(buff)
print "chardet result: " + str(result)
return (result['encoding'], buff)
def _redirect_if_needed(self, wbrequest, cdx):
is_proxy = wbrequest.is_proxy
if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
raise wbexceptions.InternalRedirect(new_url)

47
pywb/rewrite/README.md Normal file
View File

@ -0,0 +1,47 @@
## PyWb Rewrite v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_rewrite.png?branch=master)](https://travis-ci.org/ikreymer/pywb_rewrite)
This package includes the content rewriting component of the pywb wayback tool suite.
This package applies standard content rewriting, in the form of url rewriting, to
HTTP headers, html, css, js and xml content.
Additional domain-specific rewriting is planned, especially for JS, to allow for proper
replay of difficult pages.
### Command-Line Rewriter
To enable easier testing of rewriting, this package includes a command-line rewriter
which fetches a live url and applies the registered rewriting rules to it.
After installing with:
`pip install -r requirements.txt`
Run:
`python ./pywb_rewrite/rewrite_live.py http://example.com`
To specify custom timestamp and prefix:
```
python ./pywb_rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
```
This will print to stdout the content of `http://example.com` with all urls rewritten relative to
`/mycoll/20141026000102/http://mysite.example.com/path.html`.
Headers are also rewritten. For further details, consult the `get_rewritten` function in
[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py)
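The same rewriting can also be invoked programmatically; a minimal sketch (assuming the `pywb.rewrite` package layout used by the tests in this commit):

```
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_live import get_rewritten

urlrewriter = UrlRewriter('20131226101010/http://example.com/', '/mycoll/')
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
print buff
```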
### Tests
Rewriting doctests as well as live rewriting tests (subject to change) are provided.
To run full test suite: `python run-tests.py`

View File

@ -1,4 +1,4 @@
from wbrequestresponse import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
#=================================================================
class RewrittenStatusAndHeaders:
@ -14,37 +14,6 @@ class RewrittenStatusAndHeaders:
#=================================================================
class HeaderRewriter:
"""
# Text with charset
>>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
# Redirect
>>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
# gzip
>>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
# Binary
>>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
Note: Transfer-Encoding is now always removed; the output previously ended with:
('Content-Encoding', 'gzip'),
('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
REWRITE_TYPES = {
'html': ['text/html', 'application/xhtml'],
'css': ['text/css'],
@ -122,20 +91,3 @@ class HeaderRewriter:
return (new_headers, removed_header_dict)
import utils
if __name__ == "__main__" or utils.enable_doctests():
import os
import pprint
import url_rewriter
urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
headerrewriter = HeaderRewriter()
def test_rewrite(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
return vars(rewritten)
import doctest
doctest.testmod()

View File

@ -12,75 +12,8 @@ from regex_rewriters import JSRewriter, CSSRewriter
# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
#=================================================================
class HTMLRewriter(HTMLParser):
r"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>
# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
"""
HTML-Parsing Rewriter
"""
REWRITE_TAGS = {
@ -307,16 +240,4 @@ class HTMLRewriter(HTMLParser):
self.out.write(']>')
import utils
if __name__ == "__main__" or utils.enable_doctests():
url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
print parser.rewrite(data) + parser.close()
import doctest
doctest.testmod()

View File

@ -0,0 +1,156 @@
import re
import sys
import itertools
from url_rewriter import UrlRewriter
#=================================================================
class RegexRewriter(object):
@staticmethod
def comment_out(string):
return '/*' + string + '*/'
@staticmethod
def remove_https(string):
return string.replace("https", "http")
@staticmethod
def add_prefix(prefix):
return lambda string: prefix + string
@staticmethod
def archival_rewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
def replacer(string):
return lambda x: string
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group
regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regex_str, re.M)
self.rules = rules
def filter(self, m):
return True
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules:
i += 1
full_m = i
while count > 0:
i += 1
count -= 1
if not m.group(i):
continue
# Optional filter to skip matches
if not self.filter(m):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i))
# if extracting partial match
if i != full_m:
result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]
return result
#=================================================================
class JSLinkRewriter(RegexRewriter):
"""
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules = []):
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
super(JSLinkRewriter, self).__init__(rules)
#=================================================================
class JSLocationAndLinkRewriter(JSLinkRewriter):
"""
JS Rewriter which also rewrites location and domain to the
specified prefix (default: 'WB_wombat_')
"""
def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
rules = rules + [
(r'(?<!/)\blocation\b', prefix, 0),
(r'(?<=document\.)domain', prefix, 0),
]
super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)
#=================================================================
# Set 'default' JSRewriter
JSRewriter = JSLocationAndLinkRewriter
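For example (mirroring the rewrite tests later in this commit):
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
print JSRewriter(arcrw).rewrite('location = "http://example.com/abc.html"')
# WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"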
#=================================================================
class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra = []):
rules = self._create_rules(rewriter.get_abs_url())
RegexRewriter.__init__(self, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
attr = m.group(1)
if attr and attr.startswith('xmlns'):
return False
return True
def _create_rules(self, http_prefix):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
]
#=================================================================
class CSSRewriter(RegexRewriter):
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
RegexRewriter.__init__(self, rules)
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
]

View File

@ -0,0 +1,151 @@
import chardet
from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
class RewriteContent:
DEFAULT_CONTENT_REWRITERS = {
'header': HeaderRewriter,
'js': JSRewriter,
'css': CSSRewriter,
'xml': XMLRewriter,
'html': HTMLRewriter
}
def __init__(self, rewriters = {}):
self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())
def rewrite_headers(self, urlrewriter, status_headers, stream):
rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)
# note: since chunking may be broken, the approach taken here is to *always* attempt
# to dechunk if transfer-encoding: chunked is present
#
# an alternative may be to serve chunked unless content rewriting is needed
# todo: possibly revisit this approach
if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
stream = ChunkedDataReader(stream)
return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
# see if we've already rewritten headers
if isinstance(headers, RewrittenStatusAndHeaders):
rewritten_headers = headers
elif isinstance(headers, StatusAndHeaders):
# otherwise, need to determine if rewriting is even necessary
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream)
# no rewriting needed here
if rewritten_headers.text_type is None:
gen = self.stream_to_gen(stream)
# status_headers is not assigned until below; return the rewritten headers directly
return (rewritten_headers.status_headers, gen)
status_headers = rewritten_headers.status_headers
# Handle text content rewriting
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, 'gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset
first_buff = None
else:
(encoding, first_buff) = self._detect_charset(stream)
# if chardet thinks it's ascii, use utf-8
if encoding == 'ascii':
encoding = 'utf-8'
text_type = rewritten_headers.text_type
rewriter_class = self.rewriters.get(text_type)
if not rewriter_class:
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
if text_type == 'html':
rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
else:
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
gen = self._rewriting_stream_gen(rewriter, encoding, stream, first_buff)
return (status_headers, gen)
# Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff = None):
def do_rewrite(buff):
if encoding:
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
if encoding:
buff = buff.encode(encoding)
return buff
def do_finish():
return rewriter.close()
return self.stream_to_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)
def _decode_buff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
def _detect_charset(self, stream):
buff = stream.read(8192)
result = chardet.detect(buff)
print "chardet result: " + str(result)
return (result['encoding'], buff)
# Create a generator reading from a stream, with optional rewriting and final read call
@staticmethod
def stream_to_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
try:
buff = first_buff if first_buff else stream.read()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# For adding a tail/handling final buffer
if final_read_func:
buff = final_read_func()
if buff:
yield buff
finally:
stream.close()
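A small usage sketch of the generator above, with an in-memory stream standing in for a record payload:
import StringIO

stream = StringIO.StringIO('abc')
for buff in RewriteContent.stream_to_gen(stream, rewrite_func=lambda b: b.upper()):
    print buff   # ABC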

View File

@ -0,0 +1,68 @@
import urllib2
import os
import sys
import datetime
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
"""
Fetch a url from the live web and apply rewriting rules
"""
#=================================================================
def get_status_and_stream(url):
resp = urllib2.urlopen(url)
headers = []
for name, value in resp.info().dict.iteritems():
headers.append((name, value))
status_headers = StatusAndHeaders('200 OK', headers)
stream = resp
return (status_headers, stream)
#=================================================================
def get_rewritten(url, urlrewriter):
(status_headers, stream) = get_status_and_stream(url)
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)
buff = ''
for x in gen:
buff += x
return (status_headers, buff)
#=================================================================
def main():
if len(sys.argv) < 2:
print 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'.format(sys.argv[0])
exit(1)
else:
url = sys.argv[1]
if len(sys.argv) >= 3:
wburl_str = sys.argv[2]
if wburl_str.startswith('/'):
wburl_str = wburl_str[1:]
prefix, wburl_str = wburl_str.split('/', 1)
prefix = '/' + prefix + '/'
else:
wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html'
prefix = '/pywb_rewrite/'
urlrewriter = UrlRewriter(wburl_str, prefix)
status_headers, buff = get_rewritten(url, urlrewriter)
sys.stdout.write(buff)
#=================================================================
if __name__ == "__main__":
main()

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""
#=================================================================
# HTML Rewriting
#=================================================================
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>
# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
#=================================================================
# Custom Regex
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
#=================================================================
# JS Rewriting
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
# not rewritten -- to be handled on client side
>>> _test_js(r'location = "/abc.html"')
'WB_wombat_location = "/abc.html"'
>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'
>>> _test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
#=================================================================
# XML Rewriting
#=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
#=================================================================
# HTTP Headers Rewriting
#=================================================================
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}
# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}
# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}
# Binary
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('X-Archive-Orig-Cookie', 'blah'),
('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}
# Note: Transfer-Encoding is now always removed. The previous expected
# output ended with:
#   ('Content-Encoding', 'gzip'),
#   ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}
"""
#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
print parser.rewrite(data) + parser.close()
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
def _test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)
def _test_xml(string):
return XMLRewriter(arcrw).rewrite(string)
def _test_css(string):
return CSSRewriter(arcrw).rewrite(string)
headerrewriter = HeaderRewriter()
def _test_headers(headers, status = '200 OK'):
rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
return vars(rewritten)
if __name__ == "__main__":
import doctest
doctest.testmod()
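For reference outside the doctests, the rewriters can also be driven directly; a minimal sketch reusing the arcrw rewriter defined above (the expected output is taken from the CSS doctests):

css_rw = CSSRewriter(arcrw)
print css_rw.rewrite("background: url(file.jpeg)")
# -> background: url(/web/20131010im_/http://example.com/file.jpeg)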

View File

@ -0,0 +1,32 @@
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter
# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
def test_example_2():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
def test_example_3():
status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)
assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff

View File

@ -6,43 +6,43 @@ from wburl import WbUrl
class UrlRewriter:
"""
>>> test_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
>>> test_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
>>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
>>> test_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/other.html'
>>> test_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/path/other.html'
>>> test_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
>>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
'/coll/20131112im_/http://example.com/other.html'
>>> test_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '')
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '')
'2020/http://example.com/other.html'
>>> test_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
>>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
'/web/20131010010203/http://example.com/file.html'
>>> test_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'#anchor'
>>> test_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
@ -62,7 +62,6 @@ class UrlRewriter:
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
#if self.prefix.endswith('/'):
# self.prefix = self.prefix[:-1]
@ -74,7 +73,7 @@ class UrlRewriter:
wburl = self.wburl
isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
# Optimized rewriter for
# -rel urls that don't start with / and don't contain ../ and no special mod
@ -117,12 +116,11 @@ class UrlRewriter:
return url
import utils
if __name__ == "__main__" or utils.enable_doctests():
def test_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = UrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url, mod)
def do_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = UrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url, mod)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -3,9 +3,38 @@
import re
import rfc3987
import wbexceptions
# WbUrl : wb archival url representation for WB
"""
WbUrl represents the standard wayback archival url format.
A regular url is a subset of the WbUrl (latest replay).
The WbUrl expresses the common interface for interacting
with the wayback machine.
The WbUrl may represent one of the following forms:
query form: [/modifier]/[timestamp][-end_timestamp]*/<url>
modifier, timestamp and end_timestamp are optional
*/example.com
20101112030201*/http://example.com
2009-2015*/http://example.com
/cdx/*/http://example.com
url query form: used to indicate query across urls
same as query form but with a final *
*/example.com*
20101112030201*/http://example.com*
replay form:
20101112030201/http://example.com
20101112030201im_/http://example.com
latest_replay: (no timestamp)
http://example.com
"""
class WbUrl:
"""
@ -38,6 +67,13 @@ class WbUrl:
>>> repr(WbUrl('*/http://example.com/abc?def=a*'))
"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
>>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
# timestamp range query
>>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
>>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
@ -59,16 +95,16 @@ class WbUrl:
# ======================
>>> x = WbUrl('/#$%#/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://#$%#/
Exception: Bad Request Url: http://#$%#/
>>> x = WbUrl('/http://example.com:abc/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://example.com:abc/
Exception: Bad Request Url: http://example.com:abc/
"""
# Regexs
# ======================
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
@ -85,13 +121,14 @@ class WbUrl:
self.type = None
self.url = ''
self.timestamp = ''
self.end_timestamp = ''
self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
raise Exception('Invalid WbUrl: ', url)
if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
raise Exception('Invalid WbUrl: ', url)
# protocol agnostic url -> http://
#if self.url.startswith('//'):
@ -105,7 +142,7 @@ class WbUrl:
matcher = rfc3987.match(self.url.upper(), 'IRI')
if not matcher:
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
raise Exception('Bad Request Url: ' + self.url)
# Match query regex
# ======================
@ -118,7 +155,8 @@ class WbUrl:
self.mod = res[0]
self.timestamp = res[1]
self.url = res[2]
self.end_timestamp = res[2]
self.url = res[3]
if self.url.endswith('*'):
self.type = self.URL_QUERY
self.url = self.url[:-1]
@ -151,6 +189,7 @@ class WbUrl:
atype = overrides['type'] if 'type' in overrides else self.type
mod = overrides['mod'] if 'mod' in overrides else self.mod
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp
url = overrides['url'] if 'url' in overrides else self.url
if atype == self.QUERY or atype == self.URL_QUERY:
@ -159,6 +198,8 @@ class WbUrl:
tsmod += mod + "/"
if timestamp:
tsmod += timestamp
if end_timestamp:
tsmod += '-' + end_timestamp
tsmod += "*/" + url
if atype == self.URL_QUERY:
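A sketch of the new timestamp-range support added in this hunk, assuming the overrides-based serializer shown above is exposed as to_str(**overrides) (the method name itself is not visible in this diff):

wburl = WbUrl('2009-2015*/http://example.com/')
print wburl.to_str(end_timestamp='2016')
# expected: 2009-2016*/http://example.com/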

View File

@ -1,122 +0,0 @@
import itertools
import time
import zlib
import time
import datetime
import calendar
import re
def peek_iter(iterable):
try:
first = next(iterable)
except StopIteration:
return None
return itertools.chain([first], iterable)
def split_prefix(key, prefixs):
for p in prefixs:
if key.startswith(p):
plen = len(p)
return (key[:plen], key[plen:])
def create_decompressor():
return zlib.decompressobj(16 + zlib.MAX_WBITS)
#=================================================================
# Adapted from example at
class PerfTimer:
def __init__(self, perfdict, name):
self.perfdict = perfdict
self.name = name
def __enter__(self):
self.start = time.clock()
return self
def __exit__(self, *args):
self.end = time.clock()
if self.perfdict is not None:
self.perfdict[self.name] = str(self.end - self.start)
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
# Simple test:
>>> rel_request_uri({'PATH_INFO': '/web/example.com'})
'/web/example.com'
# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']
return url
#=================================================================
def unsurt(surt):
"""
# Simple surt
>>> unsurt('com,example)/')
'example.com)/'
# Broken surt
>>> unsurt('com,example)')
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
"""
try:
index = surt.index(')/')
parts = surt[0:index].split(',')
parts.reverse()
host = '.'.join(parts)
host += surt[index:]
return host
except ValueError:
# May not be a valid surt
return surt
#=================================================================
# Support for bulk doctest testing via nose or py.test
# nosetests --with-doctest
# py.test --doctest_modules
import sys
is_in_testtool = any(sys.argv[0].endswith(tool) for tool in ['py.test', 'nosetests'])
def enable_doctests():
return is_in_testtool
def test_data_dir():
import os
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
#=================================================================
if __name__ == "__main__" or enable_doctests():
import doctest
doctest.testmod()

16
pywb/utils/README.md Normal file
View File

@ -0,0 +1,16 @@
## PyWb Utils v0.2 ##
[![Build Status](https://travis-ci.org/ikreymer/pywb_utils.png?branch=master)](https://travis-ci.org/ikreymer/pywb_utils)
This is a standalone package containing a variety of utils used by the pywb wayback tool suite.
`python run-tests.py` will run all tests
#### Modules
[binsearch.py](pywb_utils/binsearch.py) -- Binary search implementation over sorted text files
[loaders.py](pywb_utils/loaders.py) -- Loading abstractions for http and the local file system, as well as buffered and seekable file readers
[bufferedreaders.py](pywb_utils/bufferedreaders.py) -- Buffered readers, with optional gzip decompression and http chunk decoding
[statusandheaders.py](pywb_utils/statusandheaders.py) -- Representation and parsing of HTTP-style status lines and headers
[timeutils.py](pywb_utils/timeutils.py) -- Utility functions for converting between standard datetime formats and the 14-digit timestamp

0
pywb/utils/__init__.py Normal file
View File

110
pywb/utils/binsearch.py Normal file
View File

@ -0,0 +1,110 @@
"""
Utility functions for performing binary search over a sorted text file
"""
from collections import deque
import itertools
#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
"""
Find offset of the line which matches a given 'key' using binary search
    If the key is not found, the offset is that of the line after the key
File is subdivided into block_size (default 8192) sized blocks
Optional compare_func may be specified
"""
min_ = 0
max_ = reader.getsize() / block_size
while max_ - min_ > 1:
mid = min_ + ((max_ - min_) / 2)
reader.seek(mid * block_size)
if mid > 0:
reader.readline() # skip partial line
line = reader.readline()
if compare_func(key, line) > 0:
min_ = mid
else:
max_ = mid
return min_ * block_size
#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
"""
Perform a binary search for a specified key to within a 'block_size'
(default 8192) sized block followed by linear search
    within the block to find the first matching line.
    When performing linear search, keep track of up to N previous lines
    before the first matching line.
"""
min_ = binsearch_offset(reader, key, compare_func, block_size)
reader.seek(min_)
if min_ > 0:
reader.readline() # skip partial line
if prev_size > 1:
        prev_deque = deque(maxlen=prev_size)
line = None
while True:
line = reader.readline()
if not line:
break
if compare_func(line, key) >= 0:
break
if prev_size == 1:
prev = line
elif prev_size > 1:
prev_deque.append(line)
def gen_iter(line):
"""
Create iterator over any previous lines to
current matched line
"""
if prev_size == 1:
yield prev.rstrip()
elif prev_size > 1:
for i in prev_deque:
yield i.rstrip()
while line:
yield line.rstrip()
line = reader.readline()
return gen_iter(line)
#=================================================================
def iter_prefix(reader, key):
"""
Creates an iterator which iterates over lines that start with prefix
'key' in a sorted text file.
"""
return itertools.takewhile(
lambda line: line.startswith(key),
search(reader, key))
#=================================================================
def iter_exact(reader, key, token=' '):
"""
Create an iterator which iterates over lines where the first field matches
    the 'key'; equivalent to a prefix search for key + token.
    Default field terminator/separator is ' '
"""
return iter_prefix(reader, key + token)
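A minimal usage sketch, assuming a sorted cdx file on disk; the path is illustrative, and SeekableTextFileReader comes from pywb.utils.loaders (added later in this commit):

from pywb.utils.loaders import SeekableTextFileReader

reader = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
for line in iter_prefix(reader, 'org,iana)/domains/root'):
    print line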

View File

@ -0,0 +1,204 @@
import StringIO
import zlib
#=================================================================
def gzip_decompressor():
"""
    Decompressor which can decompress a gzip stream
"""
return zlib.decompressobj(16 + zlib.MAX_WBITS)
#=================================================================
class BufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
block_size (1024 default)
If an optional decompress type is specified,
data is fed through the decompressor when read from the buffer.
Currently supported decompression: gzip
If decompression fails on first try, data is assumed to be decompressed
and no exception is thrown. If a failure occurs after data has been
partially decompressed, the exception is propagated.
"""
DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
self.stream = stream
self.block_size = block_size
if decomp_type:
try:
self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
except KeyError:
raise Exception('Decompression type not supported: ' +
decomp_type)
else:
self.decompressor = None
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None):
if not block_size:
block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len:
            if self.max_len > 0:
                to_read = min(self.max_len - self.num_read, block_size)
            else:
                to_read = block_size
data = self.stream.read(to_read)
self._process_read(data)
def _process_read(self, data):
data = self._decompress(data)
self.num_read += len(data)
self.buff = StringIO.StringIO(data)
def _decompress(self, data):
if self.decompressor and data:
try:
data = self.decompressor.decompress(data)
except Exception:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
self.decompressor = None
# otherwise (partly decompressed), something is wrong
else:
raise
return data
def read(self, length=None):
self._fillbuff()
return self.buff.read(length)
def readline(self, length=None):
self._fillbuff()
return self.buff.readline(length)
def close(self):
if self.stream:
self.stream.close()
self.stream = None
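A sketch of the decompression fallback described above, using in-memory streams; both reads succeed, the second by falling back to the raw data after the first decompress attempt fails:

comp = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
gzipped = comp.compress('a line\n') + comp.flush()

print BufferedReader(StringIO.StringIO(gzipped), decomp_type='gzip').readline()
# -> a line
print BufferedReader(StringIO.StringIO('a line\n'), decomp_type='gzip').readline()
# -> a line (not gzip, so the reader falls back to the raw stream)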
#=================================================================
class ChunkedDataException(Exception):
pass
#=================================================================
class ChunkedDataReader(BufferedReader):
r"""
A ChunkedDataReader is a BufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'.
If at any point the chunked header is not available, the stream is
assumed to not be chunked and no more dechunking occurs.
Properly formatted chunked data:
>>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n"));
>>> c.read() + c.read()
'1234'
Non-chunked data:
>>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read()
'xyz123!@#'
Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#"));
>>> c.read() + c.read()
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));
>>> c.read() + c.read()
'123412'
"""
all_chunks_read = False
not_chunked = False
# if False, we'll use best-guess fallback for parse errors
raise_chunked_data_exceptions = False
def _fillbuff(self, block_size=None):
if self.not_chunked:
return BufferedReader._fillbuff(self, block_size)
if self.all_chunks_read:
return
if not self.buff or self.buff.pos >= self.buff.len:
length_header = self.stream.readline(64)
self._data = ''
try:
self._try_decode(length_header)
except ChunkedDataException:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is served
# with a Transfer-Encoding: chunked.
# Treat this as non-chunk encoded from here on.
self._process_read(length_header + self._data)
self.not_chunked = True
def _try_decode(self, length_header):
# decode length header
try:
chunk_size = int(length_header.strip().split(';')[0], 16)
except ValueError:
raise ChunkedDataException("Couldn't decode length header " +
length_header)
if not chunk_size:
# chunk_size 0 indicates end of file
self.all_chunks_read = True
#self._process_read('')
return
data_len = len(self._data)
# read chunk
while data_len < chunk_size:
new_data = self.stream.read(chunk_size - data_len)
# if we unexpectedly run out of data,
# either raise an exception or just stop reading,
# assuming file was cut off
if not new_data:
if self.raise_chunked_data_exceptions:
msg = 'Ran out of data before end of chunk'
raise ChunkedDataException(msg)
else:
chunk_size = data_len
self.all_chunks_read = True
self._data += new_data
data_len = len(self._data)
# if we successfully read a block without running out,
# it should end in \r\n
if not self.all_chunks_read:
clrf = self.stream.read(2)
if clrf != '\r\n':
raise ChunkedDataException("Chunk terminator not found.")
# hand to base class for further processing
self._process_read(self._data)
if __name__ == "__main__":
import doctest
doctest.testmod()

152
pywb/utils/loaders.py Normal file
View File

@ -0,0 +1,152 @@
"""
This module provides loaders for the local file system and over http,
supporting both local and remote access
"""
import os
import hmac
import urllib2
import time
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = range_header
if self.cookie_maker:
headers['Cookie'] = self.cookie_maker.make()
request = urllib2.Request(url, headers=headers)
return urllib2.urlopen(request)
#=================================================================
# Signed Cookie-Maker
#=================================================================
class HMACCookieMaker(object):
"""
Utility class to produce signed HMAC digest cookies
to be used with each http request
"""
def __init__(self, key, name, duration=10):
self.key = key
self.name = name
# duration in seconds
self.duration = duration
def make(self, extra_id=''):
expire = str(long(time.time() + self.duration))
if extra_id:
msg = extra_id + '-' + expire
else:
msg = expire
hmacdigest = hmac.new(self.key, msg)
hexdigest = hmacdigest.hexdigest()
if extra_id:
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
expire, hexdigest)
else:
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
return cookie
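HttpLoader and HMACCookieMaker combine as follows; the key and cookie name here are illustrative, while the byte range matches the loader tests later in this commit:

cookie_maker = HMACCookieMaker('secret-key', 'wb-auth', duration=30)
loader = HttpLoader(cookie_maker)
reader = loader.load('http://example.com', 41, 14)   # requests bytes 41-54
print reader.read()
# -> Example Domain (per the test suite)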
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
        if length > 0:
            return LimitReader(afile, length)

        # no length specified: return the whole file
        return afile
#=================================================================
# Limit Reader
#=================================================================
class LimitReader(object):
"""
A reader which will not read more than specified limit
"""
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit
if not self.limit:
self.limit = 1
def read(self, length=None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.read(length)
self.limit -= len(buff)
return buff
def readline(self, length=None):
length = min(length, self.limit) if length else self.limit
buff = self.stream.readline(length)
self.limit -= len(buff)
return buff
def close(self):
self.stream.close()
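FileLoader and LimitReader combine as follows; the path is illustrative, and the capped read mirrors the loader tests later in this commit:

loader = FileLoader()
reader = loader.load('file:///tmp/sample.cdx', 0, 100)
data = reader.read(400)   # LimitReader caps this read at 100 bytes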
#=================================================================
# Local text file with known size -- used for binsearch
#=================================================================
class SeekableTextFileReader(object):
"""
    A very simple file-like object wrapper that knows its total size,
via getsize()
Supports seek() operation.
Assumed to be a text file. Used for binsearch.
"""
def __init__(self, filename):
self.fh = open(filename, 'rb')
self.filename = filename
self.size = os.path.getsize(filename)
def getsize(self):
return self.size
def read(self):
return self.fh.read()
def readline(self):
return self.fh.readline()
def seek(self, offset):
return self.fh.seek(offset)
def close(self):
return self.fh.close()

View File

@ -0,0 +1,107 @@
"""
Representation and parsing of HTTP-style status + headers
"""
import pprint
#=================================================================
class StatusAndHeaders(object):
"""
Representation of parsed http-style status line and headers
    The status line is the first line of the request/response.
    Headers is a list of (name, value) tuples.
    An optional protocol, which appears on the first line, may also be specified.
"""
def __init__(self, statusline, headers, protocol=''):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
def get_header(self, name):
"""
        return the value of the header matching 'name'
        (case-insensitive), if found
"""
name_lower = name.lower()
for value in self.headers:
if value[0].lower() == name_lower:
return value[1]
def remove_header(self, name):
"""
remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()
for index in xrange(len(self.headers) - 1, -1, -1):
if self.headers[index][0].lower() == name_lower:
del self.headers[index]
return True
return False
def __repr__(self):
headers_str = pprint.pformat(self.headers, indent=2)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)
def __eq__(self, other):
return (self.statusline == other.statusline and
self.headers == other.headers and
self.protocol == other.protocol)
#=================================================================
class StatusAndHeadersParser(object):
"""
    Parser which consumes a stream supporting readline() to read
status and headers and return a StatusAndHeaders object
"""
def __init__(self, statuslist):
self.statuslist = statuslist
def parse(self, stream):
"""
parse stream for status line and headers
return a StatusAndHeaders object
"""
statusline = stream.readline().rstrip()
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line - Found: ' + statusline
raise StatusAndHeadersParserException(msg)
headers = []
line = stream.readline().rstrip()
while line and line != '\r\n':
name, value = line.split(':', 1)
header = (name, value.strip())
headers.append(header)
line = stream.readline().rstrip()
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,
protocol=protocol_status[0])
@staticmethod
def split_prefix(key, prefixs):
"""
split key string into prefix and remainder
for first matching prefix from a list
"""
for prefix in prefixs:
if key.startswith(prefix):
plen = len(prefix)
return (key[:plen], key[plen:])
#=================================================================
class StatusAndHeadersParserException(Exception):
"""
status + headers parsing exception
"""
pass
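A parsing sketch over an in-memory stream (Python 2 StringIO):

import StringIO

raw = 'HTTP/1.0 200 OK\r\nContent-Type: text/html\r\n\r\n'
parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
status_headers = parser.parse(StringIO.StringIO(raw))
print status_headers.get_header('Content-Type')
# -> text/html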

View File

@ -0,0 +1,52 @@
#=================================================================
"""
# binsearch tests
# Prefix Search
>>> print_binsearch_results('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
>>> print_binsearch_results('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
>>> print_binsearch_results('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""
#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def print_binsearch_results(key, iter_func):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -0,0 +1,69 @@
#=================================================================
"""
# LimitReader Tests
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# FileLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
100
# SeekableTextFileReader Test
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
>>> sr.getsize()
30399
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
# BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
# BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
"""
#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
def seek_read_full(seekable_reader, offset):
seekable_reader.seek(offset)
seekable_reader.readline() #skip
return seekable_reader.readline()
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,20 +1,25 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""
import re
import time
import datetime
import calendar
from itertools import imap
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile('[^\d]')
DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
PAD_STAMP_END = '29991231235959'
def iso_date_to_datetime(string):
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
@ -28,16 +33,18 @@ def iso_date_to_datetime(string):
if nums[-1] == '':
nums = nums[:-1]
dt = datetime.datetime(*map(int, nums))
return dt
the_datetime = datetime.datetime(*imap(int, nums))
return the_datetime
def datetime_to_timestamp(dt):
def datetime_to_timestamp(the_datetime):
"""
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
'20131226101112'
"""
return dt.strftime(TIMESTAMP_14)
return the_datetime.strftime(TIMESTAMP_14)
def iso_date_to_timestamp(string):
"""
@ -52,7 +59,7 @@ def iso_date_to_timestamp(string):
# default pad is end of range for compatibility
def pad_timestamp(string, pad_str = PAD_STAMP_END):
def pad_timestamp(string, pad_str=PAD_STAMP_END):
"""
>>> pad_timestamp('20')
'20991231235959'
@ -76,10 +83,12 @@ def pad_timestamp(string, pad_str = PAD_STAMP_END):
def timestamp_to_datetime(string):
"""
>>> timestamp_to_datetime('20131226095010')
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
>>> timestamp_to_datetime('2014')
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
"""
    # Default pad to end of range for compatibility
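A round-trip sketch combining the helpers above:

the_datetime = iso_date_to_datetime('2013-12-26T10:11:12Z')
print datetime_to_timestamp(the_datetime)
# -> 20131226101112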

View File

@ -1,4 +1,4 @@
import cdxserver.timeutils as timeutils
import pywb.utils.timeutils as timeutils
import wbrequestresponse
import wbexceptions

22
pywb/warc/README.md Normal file
View File

@ -0,0 +1,22 @@
## PyWb Warc v0.2
[![Build Status](https://travis-ci.org/ikreymer/pywb_warc.png?branch=master)](https://travis-ci.org/ikreymer/pywb_warc)
This is the WARC/ARC record loading component of pywb wayback tool suite.
This package provides the following facilities:
* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers
* Resolve 'revisit' records from the provided index to find the full record with headers and payload content
* Load WARC and ARC records either locally or via http using http 1.1 range requests
### Tests
This package includes a test suite covering the different WARC and ARC loading formats.
To run: `python run-tests.py`

0
pywb/warc/__init__.py Normal file
View File

View File

@ -1,13 +1,27 @@
import redis
import binsearch.binsearch
from pywb.utils.binsearch import iter_exact
from pywb.utils.loaders import SeekableTextFileReader
import urlparse
import os
import logging
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
"""
The purpose of this module is to 'resolve' a warc/arc filename,
often found in a CDX file, to a full loadable url.
Supported resolvers are: url prefix, path index lookup and redis
make_best_resolver() attempts to guess the resolver method for a given uri
"""
#=================================================================
# PrefixResolver - convert cdx file entry to url with prefix
# if url contains specified string
#=================================================================
class PrefixResolver:
def __init__(self, prefix, contains):
self.prefix = prefix
@ -18,14 +32,15 @@ class PrefixResolver:
def __repr__(self):
if self.contains:
return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains)
return ("PrefixResolver('{0}', contains = '{1}')"
.format(self.prefix, self.contains))
else:
return "PrefixResolver('{0}')".format(self.prefix)
#======================================
#=================================================================
class RedisResolver:
def __init__(self, redis_url, key_prefix = None):
def __init__(self, redis_url, key_prefix=None):
self.redis_url = redis_url
self.key_prefix = key_prefix if key_prefix else 'w:'
self.redis = redis.StrictRedis.from_url(redis_url)
@ -42,14 +57,14 @@ class RedisResolver:
return "RedisResolver('{0}')".format(self.redis_url)
#======================================
#=================================================================
class PathIndexResolver:
def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file
self.reader = binsearch.binsearch.FileReader(pathindex_file)
self.reader = SeekableTextFileReader(pathindex_file)
def __call__(self, filename):
result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
result = iter_exact(self.reader, filename, '\t')
def gen_list(result):
for pathline in result:
@ -63,6 +78,7 @@ class PathIndexResolver:
return "PathIndexResolver('{0}')".format(self.pathindex_file)
#=================================================================
#TODO: more options (remote files, contains param, etc..)
# find best resolver given the path
def make_best_resolver(param):
@ -80,11 +96,14 @@ def make_best_resolver(param):
RedisResolver('redis://myhost.example.com:1234/1')
# a file
>>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
>>> r = make_best_resolver('file://' + os.path.realpath(__file__))
>>> r.__class__.__name__
'PathIndexResolver'
# a dir
>>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
>>> path = os.path.realpath(__file__)
>>> r = make_best_resolver('file://' + os.path.dirname(path))
>>> r.__class__.__name__
'PrefixResolver'
"""
@ -99,27 +118,29 @@ def make_best_resolver(param):
url_parts = urlparse.urlsplit(path)
if url_parts.scheme == 'redis':
logging.info('Adding Redis Index: ' + path)
logging.debug('Adding Redis Index: ' + path)
return RedisResolver(path, arg)
if url_parts.scheme == 'file':
path = url_parts.path
if os.path.isfile(path):
logging.info('Adding Path Index: ' + path)
logging.debug('Adding Path Index: ' + path)
return PathIndexResolver(path)
# non-file paths always treated as prefix for now
else:
logging.info('Adding Archive Path Source: ' + path)
logging.debug('Adding Archive Path Source: ' + path)
return PrefixResolver(path, arg)
#=================================================================
def make_best_resolvers(paths):
"""
>>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
>>> r = make_best_resolvers(['http://example.com/warcs/',\
'redis://example.com:1234/1'])
>>> map(lambda x: x.__class__.__name__, r)
['PrefixResolver', 'RedisResolver']
"""
if hasattr(paths, '__iter__'):
return map(make_best_resolver, paths)
@ -127,13 +148,7 @@ def make_best_resolvers(paths):
return [make_best_resolver(paths)]
import utils
#=================================================================
if __name__ == "__main__" or utils.enable_doctests():
def class_name(obj):
return obj.__class__.__name__
if __name__ == "__main__":
import doctest
doctest.testmod()
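A sketch of resolution itself; per the description above, PrefixResolver prepends its prefix to the filename (the warc name is illustrative):

resolver = make_best_resolver('http://myhost.example.com/warcs/')
print resolver('iana.warc.gz')
# expected: ['http://myhost.example.com/warcs/iana.warc.gz']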

161
pywb/warc/recordloader.py Normal file
View File

@ -0,0 +1,161 @@
import itertools
import urlparse
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' +
'stream, status_headers')
#=================================================================
class ArchiveLoadFailed(Exception):
def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename
#self.reason = reason
def status(self):
return '503 Service Unavailable'
#=================================================================
class ArcWarcRecordLoader:
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"]
    # Since we are loading a range request,
    # gzip-ness can only be determined by the file extension
# (BufferedReader will however default to non-gzip if
# decompression fails)
FORMAT_MAP = {
'.warc.gz': ('warc', True),
'.arc.gz': ('arc', True),
'.warc': ('warc', False),
'.arc': ('arc', False),
}
@staticmethod
def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
self.warc_parser = StatusAndHeadersParser(warc_types)
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url)
the_format = None
for ext, iformat in self.FORMAT_MAP.iteritems():
if url.endswith(ext):
the_format = iformat
break
if the_format is None:
raise ArchiveLoadFailed('Unknown file format', url)
(a_format, is_gzip) = the_format
#decomp = utils.create_decompressor() if is_gzip else None
decomp_type = 'gzip' if is_gzip else None
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
if a_format == 'arc':
rec_headers = self.arc_parser.parse(stream)
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
elif a_format == 'warc':
rec_headers = self.warc_parser.parse(stream)
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
# special case: empty w/arc record (hopefully a revisit)
if empty:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
# attempt to add 200 status and content-type
elif rec_type == 'metadata' or rec_type == 'resource':
content_type = [('Content-Type',
rec_headers.get_header('Content-Type'))]
status_headers = StatusAndHeaders('200 OK', content_type)
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
# if content_type and (';version=0.9' in content_type):
# status_headers = StatusAndHeaders('200 OK', [])
# response record: parse HTTP status and headers!
else:
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
return ArcWarcRecord((a_format, rec_type),
rec_headers, stream, status_headers)
#=================================================================
class ARCHeadersParser:
def __init__(self, headernames):
self.headernames = headernames
def parse(self, stream):
headerline = stream.readline().rstrip()
parts = headerline.split()
headernames = self.headernames
if len(parts) != len(headernames):
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
raise ArchiveLoadFailed(msg.format(headernames, parts))
headers = []
for name, value in itertools.izip(headernames, parts):
headers.append((name, value))
return StatusAndHeaders(statusline='',
headers=headers,
protocol='ARC/1.0')
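A loading sketch tying the pieces together; the path is illustrative, while the offset and length mirror the WARC tests that follow:

loader = ArcWarcRecordLoader()
record = loader.load('sample_archive/warcs/example.warc.gz', '333', '1043')
print record.type
# -> ('warc', 'response')
print record.status_headers.statusline
# -> 200 OK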

View File

@ -0,0 +1,176 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pathresolvers import make_best_resolvers
#=================================================================
class ResolvingLoader:
def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
cdx_server=None):
self.path_resolvers = make_best_resolvers(paths)
self.record_loader = record_loader
self.cdx_server = cdx_server
def resolve_headers_and_payload(self, cdx, failed_files):
"""
Resolve headers and payload for a given capture
In the simple case, headers and payload are in the same record.
In the case of revisit records, the payload and headers may be in
different records.
        If the original has already been found, look it up using the
        orig.* fields in the cdx dict.
Otherwise, call _load_different_url_payload() to get cdx index
from a different url to find the original record.
"""
has_curr = (cdx['filename'] != '-')
has_orig = (cdx.get('orig.filename', '-') != '-')
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = None
if has_curr:
headers_record = self._resolve_path_load(cdx, False, failed_files)
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx['mimetype'] == 'warc/revisit' and headers_record:
payload_record = self._load_different_url_payload(cdx,
headers_record,
failed_files)
# single lookup cases
# case 2: non-revisit
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (has_orig):
payload_record = self._resolve_path_load(cdx, True, failed_files)
# special case: set header to payload if old-style revisit
# with missing header
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
headers_record.stream.close()
# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx))
return (headers_record.status_headers, payload_record.stream)
def _resolve_path_load(self, cdx, is_original, failed_files):
"""
Load specific record based on filename, offset and length
fields in the cdx.
If original=True, use the orig.* fields for the cdx
Resolve the filename to full path using specified path resolvers
If failed_files list provided, keep track of failed resolve attempts
"""
if is_original:
(filename, offset, length) = (cdx['orig.filename'],
cdx['orig.offset'],
cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'],
cdx['offset'],
cdx['length'])
# optimization: if same file already failed this request,
# don't try again
if failed_files and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed', filename)
any_found = False
last_exc = None
for resolver in self.path_resolvers:
possible_paths = resolver(filename)
if possible_paths:
for path in possible_paths:
any_found = True
try:
return self.record_loader.load(path, offset, length)
except Exception as ue:
last_exc = ue
# Unsuccessful if reached here
if failed_files:
failed_files.append(filename)
if last_exc:
msg = str(last_exc.__class__.__name__)
else:
msg = 'Archive File Not Found'
raise ArchiveLoadFailed(msg, filename)
def _load_different_url_payload(self, cdx, headers_record, failed_files):
"""
Handle the case where a duplicate of a capture with same digest
exists at a different url.
If a cdx_server is provided, a query is made for matching
url, timestamp and digest.
Raise exception if no matches found.
"""
ref_target_uri = (headers_record.rec_headers.
get_header('WARC-Refers-To-Target-URI'))
target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')
# Check for unresolved revisit error,
# if refers to target uri not present or same as the current url
if not ref_target_uri or (ref_target_uri == target_uri):
raise ArchiveLoadFailed('Missing Revisit Original')
ref_target_date = (headers_record.rec_headers.
get_header('WARC-Refers-To-Date'))
if not ref_target_date:
ref_target_date = cdx['timestamp']
else:
ref_target_date = iso_date_to_timestamp(ref_target_date)
        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                cdx['digest'])

        for cdx in orig_cdx_lines:
            try:
                # load the matching capture via the path resolvers
                payload_record = self._resolve_path_load(cdx, False,
                                                         failed_files)
                return payload_record
            except ArchiveLoadFailed:
                pass
raise ArchiveLoadFailed('Original for revisit could not be loaded')
    def load_cdx_for_dupe(self, url, timestamp, digest):
"""
If a cdx_server is available, return response from server,
otherwise empty list
"""
if not self.cdx_server:
return []
params = {'url': url,
                  'closest': timestamp,
'filter': 'digest:' + digest,
'output': 'raw'}
return self.cdx_server.load_cdx(params)
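A usage sketch, reusing a cdx line from the tests that follow (the warc directory is illustrative):

from pywb.cdx.cdxobject import CDXObject

loader = ResolvingLoader('sample_archive/warcs/')
line = ('com,example)/ 20140216050221 http://example.com/ text/html 200 '
        'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
headers, stream = loader.resolve_headers_and_payload(CDXObject(line), [])
print headers.statusline
# -> 200 OK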

View File

@ -0,0 +1,199 @@
"""
Test loading different types of records from a variety of formats
# Load response record from WARC
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
# Load revisit record from WARC
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))
# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
# ==============================================================================
# Test loading from ARC based on cdx line
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
# Test loading from WARC based on cdx line
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Test cdx w/ revisit
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>
# Test loading warc created by wget 1.14
>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FB4)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>
# Error Handling
# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
# Error Expected with revisit -- invalid offset on original
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException
"""
import os
import sys
import pprint
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.cdx.cdxobject import CDXObject
from pywb import get_test_dir
#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_warc_dir = get_test_dir() + 'warcs/'
def load_test_archive(test_file, offset, length):
    path = test_warc_dir + test_file
    testloader = ArcWarcRecordLoader()
    # load the record at the given offset/length and dump its type, headers and status
    archive = testloader.load(path, offset, length)
    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
def load_from_cdx_test(cdx):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)
    # resolve the cdx line (following revisits, if present) to headers + payload stream
    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
    print headers
    sys.stdout.write(stream.readline())
    sys.stdout.write(stream.readline())
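All three invalid-offset cases above surface as ArchiveLoadFailed, so index-driven lookups can be guarded with a single except clause. A minimal sketch reusing only the helpers and imports already defined in this module:

def try_load(cdx_line):
    # ArchiveLoadFailed wraps parse errors such as
    # 'example.warc.gz:StatusAndHeadersParserException'
    try:
        load_from_cdx_test(cdx_line)
    except ArchiveLoadFailed as e:
        print 'load failed:', e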
View File
@ -1,8 +1,7 @@
import utils
import wbexceptions
from wbrequestresponse import WbResponse, StatusAndHeaders
from cdxserver.cdxserver import CDXException
from pywb.cdx.cdxserver import CDXException
import os
import importlib
@ -10,13 +9,37 @@ import logging
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
# Simple test:
>>> rel_request_uri({'PATH_INFO': '/web/example.com'})
'/web/example.com'
# Test all unencoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']
return url
#=================================================================
def create_wb_app(wb_router):
# Top-level wsgi application
def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = utils.rel_request_uri(env)
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
@ -95,7 +118,7 @@ def main():
raise
#=================================================================
if __name__ == "__main__" or utils.enable_doctests():
if __name__ == "__main__":
pass
else:
application = main()
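As a quick illustration of the newly inlined helper, a sketch of what rel_request_uri returns for a plain WSGI environ (standard WSGI keys only, nothing assumed beyond the function above):

env = {'PATH_INFO': '/web/example.com/path;v=1', 'QUERY_STRING': 'a=b'}
print rel_request_uri(env)                   # '/web/example.com/path;v=1?a=b'
print rel_request_uri(env, include_query=0)  # '/web/example.com/path;v=1'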
View File
@ -1,7 +1,6 @@
from wburl import WbUrl
from url_rewriter import UrlRewriter
import utils
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
#WB Request and Response
@ -182,35 +181,6 @@ class WbResponse:
def __repr__(self):
return str(vars(self))
#=================================================================
class StatusAndHeaders:
def __init__(self, statusline, headers, protocol = ''):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
def get_header(self, name):
name_lower = name.lower()
for value in self.headers:
if (value[0].lower() == name_lower):
return value[1]
def remove_header(self, name):
name_lower = name.lower()
for x in xrange(len(self.headers) - 1, -1, -1):
if self.headers[x][0].lower() == name_lower:
del self.headers[x]
break
def __repr__(self):
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
#return pprint.pformat(self.__dict__)
def __eq__(self, other):
return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
if __name__ == "__main__":
import doctest
doctest.testmod()
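With the class deleted here in favor of pywb.utils.statusandheaders (see the import change above), a minimal sketch of the same interface at its new location, assuming the moved class keeps the constructor and methods shown in the removed code:

from pywb.utils.statusandheaders import StatusAndHeaders

# Assumption: the relocated class keeps the old interface.
sh = StatusAndHeaders('200 OK',
                      [('Content-Type', 'text/html'), ('X-Cache', 'HIT')],
                      protocol='HTTP/1.1')
print sh.get_header('content-type')  # lookup is case-insensitive: 'text/html'
sh.remove_header('X-CACHE')          # removes the matching header, scanning from the end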
run-tests.py Normal file
View File
@ -0,0 +1,3 @@
import pytest
result = pytest.main('-v --doctest-modules tests/ pywb/')
exit(result)
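The same run expressed with an argument list, which sidesteps pytest's string splitting; a sketch, and the behavior should be identical:

import pytest
# Equivalent to the string form above; a list stays unambiguous if paths gain spaces.
result = pytest.main(['-v', '--doctest-modules', 'tests/', 'pywb/'])
exit(result)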
Binary file not shown.
Binary file not shown.
View File
@ -0,0 +1,69 @@
filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
1 0 LiveWeb Capture
URL IP-address Archive-date Content-type Archive-length
http://example.com/ 93.184.216.119 20140216050221 text/html 1591
HTTP/1.1 200 OK
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html
Date: Sun, 16 Feb 2014 05:02:20 GMT
Etag: "359670651"
Expires: Sun, 23 Feb 2014 05:02:20 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (sjc/4FCE)
X-Cache: HIT
x-ec-custom-error: 1
Content-Length: 1270
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
Binary file not shown.
View File
@ -5,18 +5,18 @@ import setuptools
import glob
setuptools.setup(name='pywb',
version='0.1',
version='0.2',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ilya@archive.org',
long_description=open('README.md').read(),
license='GPL',
packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest'],
tests_require=['WebTest', 'pytest'],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
# tests_require=['WebTest', 'pytest'],
zip_safe=False)
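A quick post-install sanity check that the split packages declared above actually import; only the names listed in packages= are assumed:

# Verify the pywb 0.2 package split is importable after 'python setup.py install'.
import pywb
import pywb.utils
import pywb.cdx
import pywb.warc
import pywb.rewrite
print 'pywb 0.2 packages OK'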
View File
@ -0,0 +1,88 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
# Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False
"""
from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host:
env['HTTP_HOST'] = http_host
routes = [Route(coll, BaseHandler())]
redir = ReferRedirect(match_host)
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, routes)
if not rep:
return False
return rep.status_headers.get_header('Location')
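For completeness, the same redirect resolution without the helper, wiring ReferRedirect and a Route together directly (all names come from the module above):

env = {'REL_REQUEST_URI': '/other.html',
       'HTTP_REFERER': 'http://localhost:8080/coll/20131010/http://example.com/path/page.html',
       'SCRIPT_NAME': ''}
redir = ReferRedirect('http://localhost:8080/')
rep = redir(env, [Route('coll', BaseHandler())])
print rep.status_headers.get_header('Location')
# -> http://localhost:8080/coll/20131010/http://example.com/other.html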
View File
@ -1,43 +0,0 @@
import os
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def binsearch_cdx_test(key, iter_func):
"""
# Prefix Search
>>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
>>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
# Exact Search
>>> binsearch_cdx_test('org,iaana)/', iter_exact)
>>> binsearch_cdx_test('org,ibna)/', iter_exact)
>>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""
cdx = FileReader(test_cdx_dir + 'iana.cdx')
for line in iter_func(cdx, key):
print line
if __name__ == "__main__":
import doctest
doctest.testmod()
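These binary-search doctests are deleted here; for reference, a sketch of the same exact-match lookup against the new layout. The pywb.utils.binsearch location and the unchanged iter_exact/FileReader names are assumptions, since this diff only shows the removal:

# Assumption: the binsearch helpers moved under pywb.utils with the same names.
from pywb.utils.binsearch import iter_exact, FileReader

cdx = FileReader('sample_archive/cdx/iana.cdx')
for line in iter_exact(cdx, 'org,iana)/time-zones'):
    print line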
View File
@ -1,149 +0,0 @@
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
from ..pywb.cdxserver.cdxserver import CDXServer
import os
import sys
import pprint
test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
>>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'),
('original', 'http://example.com'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('redirect', '-'),
('robotflags', '-'),
('length', '1046'),
('offset', '334'),
('filename', 'dupes.warc.gz')]
"""
kwparams['url'] = url
kwparams['output'] = 'text'
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
sys.stdout.write(x)
if __name__ == "__main__":
import doctest
doctest.testmod()
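The deleted doctests above drove CDXServer from the old cdxserver package; a sketch of the equivalent call against pywb.cdx.cdxserver. The CDXException import change earlier in this commit confirms the module path, but CDXServer itself living there is an assumption:

import sys

# Assumption: CDXServer is exported from the relocated pywb.cdx.cdxserver module.
from pywb.cdx.cdxserver import CDXServer

server = CDXServer(['sample_archive/cdx/iana.cdx'])
for line in server.load_cdx(url='http://iana.org/', output='text', limit=1):
    sys.stdout.write(line)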
View File
@ -1,7 +1,7 @@
import webtest
from ..pywb.pywb_init import pywb_config
from ..pywb.wbapp import create_wb_app
from ..pywb.cdxserver.cdxobject import CDXObject
from pywb.pywb_init import pywb_config
from pywb.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject
class TestWb:
TEST_CONFIG = 'test_config.yaml'