mirror of https://github.com/webrecorder/pywb.git
synced 2025-03-14 15:53:28 +01:00
pywb 0.2!
move to distinct packages: pywb.utils, pywb.cdx, pywb.warc, pywb.util, pywb.rewrite! Each package will have its own README and tests, with shared sample_data and install.
This commit is contained in:
parent
2528ee0a7c
commit
5345459298
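The headline change is packaging: formerly flat, top-level modules now live under package-qualified paths. A minimal before/after sketch of the import rewrites this commit applies throughout the tree (both forms are taken verbatim from the hunks below):

    # before (pywb 0.1): flat top-level imports
    from url_rewriter import UrlRewriter
    from wburl import WbUrl

    # after (pywb 0.2): package-qualified imports
    from pywb.rewrite.url_rewriter import UrlRewriter
    from pywb.rewrite.wburl import WbUrl
    from pywb.utils.timeutils import timestamp_to_sec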
@@ -1,2 +0,0 @@
-#Allow importing
-
@@ -1,3 +1,4 @@
-#Allow importing
+import os
+
 def get_test_dir():
     return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
@@ -3,8 +3,8 @@ import re
 import wbexceptions
 
 from wbrequestresponse import WbRequest, WbResponse
-from url_rewriter import UrlRewriter
-from wburl import WbUrl
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl
 
 #=================================================================
 # ArchivalRouter -- route WB requests in archival mode
@@ -45,20 +45,6 @@ class ArchivalRouter:
     # of request uri (excluding first '/')
 #=================================================================
 class Route:
-    """
-    # route with relative path
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
-    {'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
-
-    # route with absolute path, running at script /my_pywb
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
-    {'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
-
-
-    # not matching route -- skipped
-    >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
-    """
-
     # match upto next / or ? or end
     SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
 
@@ -127,57 +113,6 @@ class Route:
 # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
 #=================================================================
 class ReferRedirect:
-
-    """
-    >>> ReferRedirect('http://localhost:8080/').match_prefixs
-    ['http://localhost:8080/']
-
-    >>> ReferRedirect(['http://example:9090/']).match_prefixs
-    ['http://example:9090/']
-
-    >>> test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
-
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    # Custom collection
-    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
-    'http://localhost:8080/complex/123/20131010/http://example.com/other.html'
-
-    # With timestamp included
-    >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/other.html'
-
-    # With timestamp included
-    >>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
-    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
-
-    # Wrong Host
-    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
-    False
-
-    # Right Host
-    >>> test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
-    'http://example.com:8080/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
-    'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME + timestamp
-    >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
-    'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
-
-    # With custom SCRIPT_NAME, bad match
-    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
-    False
-
-    """
-
     def __init__(self, match_prefixs):
         if isinstance(match_prefixs, list):
             self.match_prefixs = match_prefixs
@@ -240,31 +175,3 @@ class ReferRedirect:
         final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
 
         return WbResponse.redir_response(final_url)
-
-
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-
-    import handlers
-
-    def test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
-        env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
-
-        if http_host:
-            env['HTTP_HOST'] = http_host
-
-        routes = [Route(coll, handlers.BaseHandler())]
-
-        redir = ReferRedirect(match_host)
-        #req = WbRequest.from_uri(request_uri, env)
-        rep = redir(env, routes)
-        if not rep:
-            return False
-
-        return rep.status_headers.get_header('Location')
-
-
-    import doctest
-    doctest.testmod()
@@ -1,461 +0,0 @@
-import itertools
-import utils
-import urllib2
-import StringIO
-import urlparse
-import collections
-import wbexceptions
-
-from wbrequestresponse import StatusAndHeaders
-
-#=================================================================
-# load a reader from http
-#=================================================================
-class HttpLoader:
-    """
-    Load content over http with range request and optional signature
-    """
-    def __init__(self, hmac = None, hmac_duration = 30):
-        self.hmac = hmac
-        self.hmac_duration = hmac_duration
-
-    def load(self, url, offset, length):
-        if length > 0:
-            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
-        else:
-            range_header = 'bytes={0}-'.format(offset)
-
-        headers = {}
-        headers['Range'] = range_header
-
-        if self.hmac:
-            headers['Cookie'] = self.hmac(self.hmac_duration)
-
-        request = urllib2.Request(url, headers = headers)
-        return urllib2.urlopen(request)
-
-
-#=================================================================
-# load a reader from local filesystem
-#=================================================================
-class FileLoader:
-    """
-    Load content from local file-system
-
-    # Ensure attempt to read more than 100 bytes, only reads 100 bytes
-    >>> len(FileLoader().load(utils.test_data_dir() + 'warcs/iana.warc.gz', 0, 100).read('400'))
-    100
-
-    """
-
-    def load(self, url, offset, length):
-        if url.startswith('file://'):
-            url = url[len('file://'):]
-
-        afile = open(url, 'rb')
-        afile.seek(offset)
-
-        if length > 0:
-            return LimitReader(afile, length)
-        else:
-            return afile
-
-#=================================================================
-# A reader which will not read past the specified limit
-#=================================================================
-class LimitReader:
-    """
-    >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
-    'abcdefghji'
-
-    >>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
-    'abcdefgh'
-
-    >>> test_multiple_reads(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
-    'efghji'
-
-    """
-
-    def __init__(self, stream, limit):
-        self.stream = stream
-        self.limit = limit
-
-        if not self.limit:
-            self.limit = 1
-
-
-    def read(self, length = None):
-        length = min(length, self.limit) if length else self.limit
-        buff = self.stream.read(length)
-        self.limit -= len(buff)
-        return buff
-
-
-    def readline(self, length = None):
-        length = min(length, self.limit) if length else self.limit
-        buff = self.stream.readline(length)
-        self.limit -= len(buff)
-        return buff
-
-    def close(self):
-        self.stream.close()
-
-
-#=================================================================
-WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, rec_headers, stream, status_headers')
-
-#=================================================================
-
-class ArchiveLoader:
-    """
-    >>> load_test_archive('example.warc.gz', '333', '1043')
-    (('warc', 'response'),
-     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
-      ('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
-      ('WARC-Date', '2014-01-03T03:03:21Z'),
-      ('Content-Length', '1610'),
-      ('Content-Type', 'application/http; msgtype=response'),
-      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
-      ('WARC-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
-     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
-      ('Cache-Control', 'max-age=604800'),
-      ('Content-Type', 'text/html'),
-      ('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
-      ('Etag', '"359670651"'),
-      ('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
-      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
-      ('Server', 'ECS (sjc/4FCE)'),
-      ('X-Cache', 'HIT'),
-      ('x-ec-custom-error', '1'),
-      ('Content-Length', '1270'),
-      ('Connection', 'close')]))
-
-
-    >>> load_test_archive('example.warc.gz', '1864', '553')
-    (('warc', 'revisit'),
-     StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
-      ('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
-      ('WARC-Date', '2014-01-03T03:03:41Z'),
-      ('Content-Length', '340'),
-      ('Content-Type', 'application/http; msgtype=response'),
-      ('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
-      ('WARC-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
-      ( 'WARC-Profile',
-        'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
-      ('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
-      ('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
-     StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
-      ('Cache-Control', 'max-age=604800'),
-      ('Content-Type', 'text/html'),
-      ('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
-      ('Etag', '"359670651"'),
-      ('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
-      ('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
-      ('Server', 'ECS (sjc/4FCE)'),
-      ('X-Cache', 'HIT'),
-      ('x-ec-custom-error', '1'),
-      ('Content-Length', '1270'),
-      ('Connection', 'close')]))
-    """
-
-    # Standard ARC headers
-    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]
-
-    # Since loading a range request, can only determine gzip-ness based on file extension
-    FORMAT_MAP = {
-        '.warc.gz': ('warc', True),
-        '.arc.gz': ('arc', True),
-        '.warc': ('warc', False),
-        '.arc': ('arc', False),
-    }
-
-    @staticmethod
-    def create_default_loaders(hmac = None):
-        http = HttpLoader(hmac)
-        file = FileLoader()
-        return {
-            'http': http,
-            'https': http,
-            'file': file,
-            '': file
-        }
-
-
-    def __init__(self, loaders = {}, hmac = None, chunk_size = 8192):
-        self.loaders = loaders if loaders else ArchiveLoader.create_default_loaders(hmac)
-        self.chunk_size = chunk_size
-
-        self.arc_parser = ARCHeadersParser(ArchiveLoader.ARC_HEADERS)
-        self.warc_parser = StatusAndHeadersParser(['WARC/1.0', 'WARC/0.17', 'WARC/0.18'])
-        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
-
-    def load(self, url, offset, length):
-        url_parts = urlparse.urlsplit(url)
-
-        loader = self.loaders.get(url_parts.scheme)
-        if not loader:
-            raise wbexceptions.UnknownLoaderProtocolException(url)
-
-        the_format = None
-
-        for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
-            if url.endswith(ext):
-                the_format = iformat
-                break
-
-        if the_format is None:
-            raise wbexceptions.UnknownArchiveFormatException(url)
-
-        (a_format, is_gzip) = the_format
-
-        decomp = utils.create_decompressor() if is_gzip else None
-
-        try:
-            length = int(length)
-        except:
-            length = -1
-
-
-        raw = loader.load(url, long(offset), length)
-
-        stream = LineReader(raw, length, self.chunk_size, decomp)
-
-        if a_format == 'arc':
-            rec_headers = self.arc_parser.parse(stream)
-            rec_type = 'response'
-            empty = (rec_headers.get_header('length') == 0)
-
-        elif a_format == 'warc':
-            rec_headers = self.warc_parser.parse(stream)
-            rec_type = rec_headers.get_header('WARC-Type')
-            empty = (rec_headers.get_header('Content-Length') == '0')
-
-        # special case: empty w/arc record (hopefully a revisit)
-        if empty:
-            status_headers = StatusAndHeaders('204 No Content', [])
-
-        # special case: warc records that are not expected to have http headers
-        # attempt to add 200 status and content-type
-        elif rec_type == 'metadata' or rec_type == 'resource':
-            status_headers = StatusAndHeaders('200 OK', [('Content-Type', rec_headers.get_header('Content-Type'))])
-
-        # special case: http 0.9 response, no status or headers
-        #elif rec_type == 'response':
-        #    content_type = rec_headers.get_header('Content-Type')
-        #    if content_type and (';version=0.9' in content_type):
-        #        status_headers = StatusAndHeaders('200 OK', [])
-
-        # response record: parse HTTP status and headers!
-        else:
-            #(statusline, http_headers) = self.parse_http_headers(stream)
-            status_headers = self.http_parser.parse(stream)
-
-        return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
-
-
-#=================================================================
-class StatusAndHeadersParser:
-    def __init__(self, statuslist):
-        self.statuslist = statuslist
-
-    def parse(self, stream):
-        statusline = stream.readline().rstrip()
-
-        protocol_status = utils.split_prefix(statusline, self.statuslist)
-
-        if not protocol_status:
-            raise wbexceptions.InvalidArchiveRecordException('Expected Status Line, Found: ' + statusline)
-
-        headers = []
-
-        line = stream.readline().rstrip()
-        while line and line != '\r\n':
-            name, value = line.split(':', 1)
-            header = (name, value.strip())
-            headers.append(header)
-            line = stream.readline().rstrip()
-
-        return StatusAndHeaders(statusline = protocol_status[1].strip(), headers = headers, protocol = protocol_status[0])
-
-#=================================================================
-class ARCHeadersParser:
-    def __init__(self, headernames):
-        self.headernames = headernames
-
-
-    def parse(self, stream):
-        headerline = stream.readline().rstrip()
-
-        parts = headerline.split()
-
-        headernames = self.headernames
-
-        if len(parts) != len(headernames):
-            raise wbexceptions.InvalidArchiveRecordException('Wrong # of heaeders, expected arc headers {0}, Found {1}'.format(headernames, parts))
-
-        headers = []
-
-        for name, value in itertools.izip(headernames, parts):
-            headers.append((name, value))
-
-        return StatusAndHeaders(statusline = '', headers = headers, protocol = 'ARC/1.0')
-
-#=================================================================
-class LineReader:
-    def __init__(self, stream, max_len = 0, chunk_size = 1024, decomp = None):
-        self.stream = stream
-        self.chunk_size = chunk_size
-        self.decomp = decomp
-        self.buff = None
-        self.num_read = 0
-        self.max_len = max_len
-
-    def _fillbuff(self, chunk_size = None):
-        if not chunk_size:
-            chunk_size = self.chunk_size
-
-        if not self.buff or self.buff.pos >= self.buff.len:
-            to_read = min(self.max_len - self.num_read, self.chunk_size) if (self.max_len > 0) else self.chunk_size
-            data = self.stream.read(to_read)
-            self._process_read(data)
-
-    def _process_read(self, data):
-        if self.decomp and data:
-            try:
-                data = self.decomp.decompress(data)
-            except Exception:
-                # if first read attempt, assume non-gzipped stream
-                if self.num_read == 0:
-                    self.decomp = False
-                # otherwise (partly decompressed), something is wrong
-                else:
-                    raise
-
-        self.num_read += len(data)
-        self.buff = StringIO.StringIO(data)
-
-
-    def read(self, length = None):
-        self._fillbuff()
-        return self.buff.read(length)
-
-    def readline(self, length = None):
-        self._fillbuff()
-        return self.buff.readline(length)
-
-    def close(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-
-
-class ChunkedDataException(Exception):
-    pass
-
-
-class ChunkedLineReader(LineReader):
-    r"""
-    Properly formatted chunked data:
-    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read()
-    '1234'
-
-    Non-chunked data:
-    >>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read()
-    'xyz123!@#'
-
-    Starts like chunked data, but isn't:
-    >>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read()
-    '1\r\nx123!@#'
-
-    Chunked data cut off part way through:
-    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read()
-    '123412'
-    """
-
-    all_chunks_read = False
-    not_chunked = False
-    raise_chunked_data_exceptions = False # if False, we'll use best-guess fallback for parse errors
-
-    def _fillbuff(self, chunk_size = None):
-        if self.not_chunked:
-            return LineReader._fillbuff(self, chunk_size)
-
-        if self.all_chunks_read:
-            return
-
-        if not self.buff or self.buff.pos >= self.buff.len:
-            length_header = self.stream.readline(64)
-            data = ''
-
-            try:
-                # decode length header
-                try:
-                    chunk_size = int(length_header.strip().split(';')[0], 16)
-                except ValueError:
-                    raise ChunkedDataException("Couldn't decode length header '%s'" % length_header)
-
-                if chunk_size:
-                    # read chunk
-                    while len(data) < chunk_size:
-                        new_data = self.stream.read(chunk_size - len(data))
-
-                        # if we unexpectedly run out of data, either raise an exception or just stop reading, assuming file was cut off
-                        if not new_data:
-                            if self.raise_chunked_data_exceptions:
-                                raise ChunkedDataException("Ran out of data before end of chunk")
-                            else:
-                                chunk_size = len(data)
-                                self.all_chunks_read = True
-
-                        data += new_data
-
-                    # if we successfully read a block without running out, it should end in \r\n
-                    if not self.all_chunks_read:
-                        clrf = self.stream.read(2)
-                        if clrf != '\r\n':
-                            raise ChunkedDataException("Chunk terminator not found.")
-
-                    if self.decomp:
-                        data = self.decomp.decompress(data)
-                else:
-                    # chunk_size 0 indicates end of file
-                    self.all_chunks_read = True
-                    data = ''
-
-                self._process_read(data)
-            except ChunkedDataException:
-                if self.raise_chunked_data_exceptions:
-                    raise
-                # Can't parse the data as chunked.
-                # It's possible that non-chunked data is set with a Transfer-Encoding: chunked
-                # Treat this as non-chunk encoded from here on
-                self._process_read(length_header + data)
-                self.not_chunked = True
-
-
-#=================================================================
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    import os
-    import pprint
-
-    testloader = ArchiveLoader()
-
-    def load_test_archive(test_file, offset, length):
-        path = utils.test_data_dir() + 'warcs/' + test_file
-
-        archive = testloader.load(path, offset, length)
-        pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
-
-    def test_multiple_reads(reader, inc_reads):
-        result = None
-        for x in inc_reads:
-            result = reader.read(x)
-        return result
-
-    import doctest
-    doctest.testmod()
@@ -1,123 +0,0 @@
-from collections import deque
-import os
-import itertools
-
-#=================================================================
-# Binary Search over a text file
-#=================================================================
-class FileReader:
-    """
-    A very simple file-like object wrapper that knows it's size
-    getsize() method returns the filesize
-    """
-    def __init__(self, filename):
-        self.fh = open(filename, 'rb')
-        self.filename = filename
-        self.size = os.path.getsize(filename)
-
-    def getsize(self):
-        return self.size
-
-    def readline(self):
-        return self.fh.readline()
-
-    def seek(self, offset):
-        return self.fh.seek(offset)
-
-    def close(self):
-        return self.fh.close()
-
-
-#=================================================================
-def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
-    """
-    Find offset of the full line which matches a given 'key' using binary search
-    If key is not found, the offset is of the line after the key
-
-    File is subdivided into block_size (default 8192) sized blocks
-    Optional compare_func may be specified
-    """
-    min = 0
-    max = reader.getsize() / block_size
-
-    while (max - min > 1):
-        mid = min + ((max - min) / 2)
-        reader.seek(mid * block_size)
-
-        if mid > 0:
-            reader.readline() # skip partial line
-
-        line = reader.readline()
-
-        if compare_func(key, line) > 0:
-            min = mid
-        else:
-            max = mid
-
-    return (min * block_size)
-
-
-def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
-    """
-    Perform a binsearch for a specified key down to block_size (8192) sized blocks,
-    followed by linear search within the block to find first matching line.
-
-    When performing linear search, keep track of up to N previous lines before
-    first matching line.
-    """
-    min = binsearch_offset(reader, key, compare_func, block_size)
-
-    reader.seek(min)
-
-    if min > 0:
-        reader.readline() # skip partial line
-
-    if prev_size > 1:
-        prev_deque = deque(maxlen = prev_size)
-
-    line = None
-
-    while True:
-        line = reader.readline()
-        if not line:
-            break
-        if compare_func(line, key) >= 0:
-            break
-
-        if prev_size == 1:
-            prev = line
-        elif prev_size > 1:
-            prev_deque.append(line)
-
-    def gen_iter(line):
-        if prev_size == 1:
-            yield prev.rstrip()
-        elif prev_size > 1:
-            for i in prev_deque:
-                yield i.rstrip()
-
-        while line:
-            yield line.rstrip()
-            line = reader.readline()
-
-    return gen_iter(line)
-
-
-# Iterate over prefix matches
-def iter_prefix(reader, key):
-    """
-    Creates an iterator which iterates over prefix matches for a key in a sorted text file
-    A line matches as long as it starts with key
-    """
-
-    return itertools.takewhile(lambda line: line.startswith(key), search(reader, key))
-
-
-def iter_exact(reader, key, token=' '):
-    """
-    Create an iterator which iterates over exact matches for a key in a sorted text file
-    Key is terminated by a token (default ' ')
-    """
-
-    return iter_prefix(reader, key + token)
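The binary-search module deleted above is relocated rather than removed: the new cdxsource.py later in this commit consumes it as pywb.utils.binsearch, with FileReader superseded by SeekableTextFileReader from pywb.utils.loaders. A minimal usage sketch, assuming the relocated functions keep the signatures of the deleted module (the sample path is hypothetical):

    from pywb.utils.binsearch import iter_exact
    from pywb.utils.loaders import SeekableTextFileReader

    # iterate cdx lines whose urlkey matches exactly (key terminated by ' ')
    source = SeekableTextFileReader('./sample_data/iana.cdx')
    for line in iter_exact(source, 'org,iana)/'):
        print line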
36 pywb/cdx/README.md Normal file

@@ -0,0 +1,36 @@
+## PyWb CDX v0.2
+
+[](https://travis-ci.org/ikreymer/pywb_cdx)
+
+This package contains the CDX processing suite of the pywb wayback tool suite.
+
+The CDX Server loads, filters and transforms cdx from multiple sources in response
+to a given query.
+
+### Installation and Tests
+
+`pip install -r requirements` -- to install
+
+`python run-tests.py` -- to run all tests
+
+
+### Sample App
+
+A very simple reference WSGI app is included.
+
+Run: `python -m pywb_cdx.wsgi_cdxserver` to start the app, keyboard interrupt to stop.
+
+The default [config.yaml](pywb_cdx/config.yaml) points to the sample data directory
+and uses port 8080
+
+### CDX Server API Reference
+
+Goal is to provide compatibility with this feature set and more:
+https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
+
+TODO
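The API reference itself is still a TODO above, but the test file at the bottom of this commit exercises wayback-cdx-server style parameters (url, limit, filter, closest, fields). A hedged sketch of querying the sample app once started, assuming it accepts those params as query-string arguments on the port named in the README:

    import urllib2

    # hypothetical query against the local sample app
    url = 'http://localhost:8080/?url=example.com&limit=3&output=text'
    for cdx_line in urllib2.urlopen(url):
        print cdx_line.rstrip()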
@@ -1,25 +1,31 @@
 from collections import OrderedDict
 import itertools
 
 
 #=================================================================
 class CDXObject(OrderedDict):
     CDX_FORMATS = [
         # Public CDX Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "length"],
 
         # CDX 11 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename"],
 
         # CDX 9 Format
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename"],
 
         # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
-         "orig.length","orig.offset","orig.filename"],
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "robotflags", "length", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"],
 
         # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
-         "orig.length","orig.offset","orig.filename"]
+        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+         "digest", "redirect", "offset", "filename",
+         "orig.length", "orig.offset", "orig.filename"]
     ]
 
     def __init__(self, cdxline):
@@ -53,5 +59,3 @@ class CDXObject(OrderedDict):
 
         li = itertools.imap(lambda (n, val): val, self.items())
         return ' '.join(li)
-
-
@@ -1,8 +1,6 @@
 from cdxobject import CDXObject
+from pywb.utils.timeutils import timestamp_to_sec
 
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
-
-import timeutils
 import bisect
 import itertools
 import re
@@ -11,7 +9,6 @@ from heapq import merge
 from collections import deque
 
 
-
 #=================================================================
 def cdx_text_out(cdx, fields):
     if not fields:
@@ -26,30 +23,31 @@ def cdx_load(sources, params):
 
     cdx_iter = make_cdx_iter(cdx_iter)
 
-    resolve_revisits = params.get('resolve_revisits', False)
-    if resolve_revisits:
-        cdx_iter = cdx_resolve_revisits(cdx_iter)
+    if not params.get('proxy_all'):
+        resolve_revisits = params.get('resolve_revisits', False)
+        if resolve_revisits:
+            cdx_iter = cdx_resolve_revisits(cdx_iter)
 
     filters = params.get('filter', None)
     if filters:
         cdx_iter = cdx_filter(cdx_iter, filters)
 
     collapse_time = params.get('collapse_time', None)
     if collapse_time:
         cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
 
     limit = int(params.get('limit', 1000000))
 
     reverse = params.get('reverse', False)
     if reverse:
         cdx_iter = cdx_reverse(cdx_iter, limit)
 
-    closest_to = params.get('closest_to', None)
+    closest_to = params.get('closest', None)
     if closest_to:
         cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
 
     if limit:
         cdx_iter = cdx_limit(cdx_iter, limit)
 
     # output raw cdx objects
     if params.get('output') == 'raw':
@@ -73,6 +71,7 @@ def load_cdx_streams(sources, params):
     merged_stream = merge(*(source_iters))
     return merged_stream
 
+
 #=================================================================
 # convert text cdx stream to CDXObject
 def make_cdx_iter(text_iter):
@@ -98,7 +97,7 @@ def cdx_reverse(cdx_iter, limit):
 
         return [last] if last else []
 
-    reverse_cdxs = deque(maxlen = limit)
+    reverse_cdxs = deque(maxlen=limit)
 
     for cdx in cdx_iter:
         reverse_cdxs.appendleft(cdx)
@@ -142,14 +141,13 @@ def cdx_filter(cdx_iter, filter_strings):
     filters = map(Filter, filter_strings)
 
     for cdx in cdx_iter:
-        if all (x(cdx) for x in filters):
+        if all(x(cdx) for x in filters):
             yield cdx
 
 
-
 #=================================================================
 # collapse by timestamp and status code
-def cdx_collapse_time_status(cdx_iter, timelen = 10):
+def cdx_collapse_time_status(cdx_iter, timelen=10):
     timelen = int(timelen)
 
     last_token = None
@@ -163,16 +161,15 @@ def cdx_collapse_time_status(cdx_iter, timelen=10):
         yield cdx
 
 
-
 #=================================================================
 # sort CDXCaptureResult by closest to timestamp
-def cdx_sort_closest(closest, cdx_iter, limit = 10):
+def cdx_sort_closest(closest, cdx_iter, limit=10):
     closest_cdx = []
 
-    closest_sec = timeutils.timestamp_to_sec(closest)
+    closest_sec = timestamp_to_sec(closest)
 
     for cdx in cdx_iter:
-        sec = timeutils.timestamp_to_sec(cdx['timestamp'])
+        sec = timestamp_to_sec(cdx['timestamp'])
         key = abs(closest_sec - sec)
 
         # create tuple to sort by key
@@ -186,22 +183,22 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
         if len(closest_cdx) > limit:
             closest_cdx.pop()
 
 
     return itertools.imap(lambda x: x[1], closest_cdx)
 
 
-
 #=================================================================
 # resolve revisits
 
 # Fields to append from cdx original to revisit
 ORIG_TUPLE = ['length', 'offset', 'filename']
 
 
 def cdx_resolve_revisits(cdx_iter):
     originals = {}
 
     for cdx in cdx_iter:
-        is_revisit = (cdx['mimetype'] == 'warc/revisit') or (cdx['filename'] == '-')
+        is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
+                      (cdx['filename'] == '-'))
 
         digest = cdx['digest']
 
@@ -210,7 +207,6 @@ def cdx_resolve_revisits(cdx_iter):
         if not original_cdx and not is_revisit:
             originals[digest] = cdx
 
-
         if original_cdx and is_revisit:
             fill_orig = lambda field: original_cdx[field]
             # Transfer mimetype and statuscode
@@ -224,5 +220,3 @@ def cdx_resolve_revisits(cdx_iter):
             cdx['orig.' + field] = fill_orig(field)
 
         yield cdx
-
-
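cdx_load above chains the ops into one lazy generator pipeline driven entirely by the params dict; each op wraps the previous iterator and nothing is read until the caller iterates. A minimal sketch of driving it directly, assuming the modules land at pywb.cdx.* and a pre-computed surt key is supplied (the sample path is hypothetical):

    from pywb.cdx.cdxsource import CDXFile
    from pywb.cdx.cdxops import cdx_load

    sources = [CDXFile('./sample_data/iana.cdx')]
    params = {'key': 'org,iana)/', 'limit': 3, 'reverse': True, 'output': 'text'}

    for line in cdx_load(sources, params):
        print line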
@@ -1,5 +1,4 @@
 import surt
-from ..binsearch.binsearch import iter_exact, iter_prefix, FileReader
 from cdxops import cdx_load
 
 import itertools
@@ -7,39 +6,21 @@ import logging
 import os
 import urlparse
 
+from cdxsource import CDXSource, CDXFile, RemoteCDXSource
+
-#=================================================================
-class CDXFile:
-    def __init__(self, filename):
-        self.filename = filename
-
-    def load_cdx(self, params):
-        source = FileReader(self.filename)
-
-        match_type = params.get('match_type')
-
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-
-        key = params.get('key')
-
-        return iter_func(source, key)
-
-    def __str__(self):
-        return 'CDX File - ' + self.filename
-
 #=================================================================
 class CDXException(Exception):
-    def __init__(self, msg, url = None):
-        Exception.__init__(self, msg)
-        self.url = url
-
     def status(self):
         return '400 Bad Request'
 
 
+#=================================================================
+class AccessException(CDXException):
+    def status(self):
+        return '403 Bad Request'
+
+
 #=================================================================
 class CDXServer:
     """
@@ -47,33 +28,51 @@ class CDXServer:
     responds to queries and dispatches to the cdx ops for processing
     """
 
-    def __init__(self, sources, surt_ordered = True):
+    @staticmethod
+    def create_from_config(config):
+        paths = config.get('index_paths')
+        surt_ordered = config.get('surt_ordered', True)
+        return CDXServer(paths, surt_ordered)
+
+    def __init__(self, sources, surt_ordered=True):
         self.sources = []
         self.surt_ordered = surt_ordered
 
         logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
 
+        if not isinstance(sources, list):
+            sources = [sources]
+
         for src in sources:
-            if os.path.isdir(src):
-                for file in os.listdir(src):
-                    self.add_cdx_loader(src + file)
-            else:
-                self.add_cdx_loader(src)
+            if isinstance(src, CDXSource):
+                self.add_cdx_source(src)
+            elif isinstance(src, str):
+                if os.path.isdir(src):
+                    for file in os.listdir(src):
+                        self.add_cdx_source(src + file)
+                else:
+                    self.add_cdx_source(src)
 
         if len(self.sources) == 0:
-            logging.exception('No CDX Sources Found!')
+            logging.exception('No CDX Sources Found from: ' + str(sources))
 
-    def add_cdx_loader(self, filename):
-        source = self.create_cdx_loader(filename)
-        if not source:
-            return
+    def add_cdx_source(self, source):
+        if not isinstance(source, CDXSource):
+            source = self.create_cdx_source(source)
+            if not source:
+                return
 
         logging.debug('Adding CDX Source: ' + str(source))
         self.sources.append(source)
 
     @staticmethod
-    def create_cdx_loader(filename):
+    def create_cdx_source(filename):
+        if filename.startswith('http://') or filename.startswith('https://'):
+            return RemoteCDXSource(filename)
+
         if filename.endswith('.cdx'):
            return CDXFile(filename)
 
        return None
        #TODO: support zipnum
        #elif filename.endswith('.summary')
@@ -81,27 +80,52 @@ class CDXServer:
        #elif filename.startswith('redis://')
        #   return RedisCDXSource(filename)
 
     def load_cdx(self, **params):
-        # canonicalize to surt (canonicalization is part of surt conversion)
+        # if key not set, assume 'url' is set and needs canonicalization
+        if not params.get('key'):
+            params['key'] = self._canonicalize(params)
+
+        self._convert_old_style(params)
+
+        return cdx_load(self.sources, params)
+
+    def _canonicalize(self, params):
+        """
+        Canonicalize url and convert to surt
+        If no surt-mode, convert back to url form
+        as surt conversion is currently part of canonicalization
+        """
         try:
             url = params['url']
         except KeyError:
-            raise CDXException('The url= param must be specified to query the cdx server')
+            msg = 'A url= param must be specified to query the cdx server'
+            raise CDXException(msg)
 
         try:
             key = surt.surt(url)
         except Exception as e:
-            raise CDXException('Invalid url: ', url)
+            raise CDXException('Invalid Url: ' + url)
 
         # if not surt, unsurt the surt to get canonicalized non-surt url
         if not self.surt_ordered:
             key = unsurt(key)
 
-        params['key'] = key
+        return key
 
-        return cdx_load(self.sources, params)
+    def _convert_old_style(self, params):
+        """
+        Convert old-style CDX Server param semantics
+        """
+        collapse_time = params.get('collapseTime')
+        if collapse_time:
+            params['collapse_time'] = collapse_time
+
+        resolve_revisits = params.get('resolveRevisits')
+        if resolve_revisits:
+            params['resolve_revisits'] = resolve_revisits
+
+        if params.get('sort') == 'reverse':
+            params['reverse'] = True
 
     def load_cdx_from_request(self, env):
         #url = wbrequest.wb_url.url
@@ -113,7 +137,8 @@ class CDXServer:
         params['output'] = 'text'
 
         # parse_qs produces arrays for single values
-        # cdxreader expects singleton params for all except filters, so convert here
+        # cdx processing expects singleton params for all params,
+        # except filters, so convert here
         # use first value of the list
         for name, val in params.iteritems():
             if name != 'filter':
@@ -122,13 +147,10 @@ class CDXServer:
         cdx_lines = self.load_cdx(**params)
         return cdx_lines
 
-
-
     def __str__(self):
         return 'load cdx indexes from ' + str(self.sources)
 
 
-
 #=================================================================
 def unsurt(surt):
     """
@@ -141,7 +163,8 @@ def unsurt(surt):
     'com,example)'
 
     # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
+    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
+index.html?a=b?c=)/')
     'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
     """
 
@@ -158,3 +181,6 @@ def unsurt(surt):
         return surt
 
 
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
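The new create_from_config hook is what lets a WSGI wrapper boot the server straight from the config.yaml added below. A minimal sketch, assuming the config is parsed with PyYAML (which is not itself part of this diff):

    import yaml

    config = yaml.load(open('pywb/cdx/config.yaml'))
    server = CDXServer.create_from_config(config)  # reads index_paths, surt_ordered

    for line in server.load_cdx(url='http://example.com/', limit=1, output='text'):
        print line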
92 pywb/cdx/cdxsource.py Normal file

@@ -0,0 +1,92 @@
+from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.loaders import SeekableTextFileReader
+
+import urllib
+import urllib2
+
+
+#=================================================================
+class CDXSource(object):
+    """
+    Represents any cdx index source
+    """
+    def load_cdx(self, params):
+        raise NotImplementedError('Implement in subclass')
+
+
+#=================================================================
+class CDXFile(CDXSource):
+    """
+    Represents a local plain-text .cdx file
+    """
+    def __init__(self, filename):
+        self.filename = filename
+
+    def load_cdx(self, params):
+        source = SeekableTextFileReader(self.filename)
+
+        match_type = params.get('match_type')
+
+        if match_type == 'prefix':
+            iter_func = iter_prefix
+        else:
+            iter_func = iter_exact
+
+        key = params.get('key')
+
+        return iter_func(source, key)
+
+    def __str__(self):
+        return 'CDX File - ' + self.filename
+
+
+#=================================================================
+class RemoteCDXSource(CDXSource):
+    """
+    Represents a remote cdx server, to which requests will be proxied.
+
+    Only url and match type params are proxied at this time,
+    the stream is passed through all other filters locally.
+    """
+    def __init__(self, filename, cookie=None, proxy_all=True):
+        self.remote_url = filename
+        self.cookie = cookie
+        self.proxy_all = proxy_all
+
+    def load_cdx(self, proxy_params):
+        if self.proxy_all:
+            params = proxy_params
+            params['proxy_all'] = True
+        else:
+            # Only send url and matchType params to remote
+            params = {}
+            params['url'] = proxy_params['url']
+            match_type = proxy_params.get('match_type')
+
+            if match_type:
+                proxy_params['matchType'] = match_type
+
+        urlparams = urllib.urlencode(params, True)
+
+        try:
+            request = urllib2.Request(self.remote_url, urlparams)
+
+            if self.cookie:
+                request.add_header('Cookie', self.cookie)
+
+            response = urllib2.urlopen(request)
+
+        except urllib2.HTTPError as e:
+            if e.code == 403:
+                exc_msg = e.read()
+                msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
+                       else 'Excluded')
+
+                raise AccessException(msg)
+            else:
+                raise
+
+        return iter(response)
+
+    def __str__(self):
+        return 'Remote CDX Server: ' + self.remote_url
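Since CDXServer now accepts anything implementing CDXSource, local files and remote cdx servers can be mixed in one sources list. A sketch under the assumption that the server class above lives at pywb.cdx.cdxserver (its filename is not shown in this diff) and that the remote endpoint is a reachable cdx server:

    from pywb.cdx.cdxsource import CDXFile, RemoteCDXSource
    from pywb.cdx.cdxserver import CDXServer  # assumed module path

    # one local .cdx file plus one remote cdx server, merged at query time
    sources = [CDXFile('./sample_data/iana.cdx'),
               RemoteCDXSource('http://web.archive.org/cdx/search/cdx')]

    server = CDXServer(sources)
    for line in server.load_cdx(url='http://example.com/', output='text'):
        print line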
3 pywb/cdx/config.yaml Normal file

@@ -0,0 +1,3 @@
+#CDX Server WSGI App Config
+index_paths: ./sample_data/
+port: 8090
163
pywb/cdx/test/cdxserver_test.py
Normal file
163
pywb/cdx/test/cdxserver_test.py
Normal file
@ -0,0 +1,163 @@
#=================================================================
"""
# Merge Sort Multiple CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


# Filter cdx
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625

>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -


>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706

>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654


# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
 ('timestamp', '20140127171200'),
 ('original', 'http://example.com'),
 ('mimetype', 'text/html'),
 ('statuscode', '200'),
 ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
 ('redirect', '-'),
 ('robotflags', '-'),
 ('length', '1046'),
 ('offset', '334'),
 ('filename', 'dupes.warc.gz')]

# NOTE: external dependency -- need self-contained test
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
 ('timestamp', '20020120142510'),
 ('original', 'http://example.com:80/'),
 ('mimetype', 'text/html'),
 ('statuscode', '200'),
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]

"""

#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
import pprint

from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_cdx_dir = get_test_dir() + 'cdx/'

def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    kwparams['url'] = url
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    for x in results:
        sys.stdout.write(x)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
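Outside of the doctest harness, the same helper logic can be exercised directly; a minimal sketch, with the sample cdx path assumed to follow the shared sample_archive layout that get_test_dir() points at:

import sys
from pywb.cdx.cdxserver import CDXServer

# sketch: what cdx_ops_test does, without the doctest harness
server = CDXServer(['./sample_archive/cdx/iana.cdx'])
for line in server.load_cdx(url='http://iana.org/', output='text', limit=2):
    sys.stdout.write(line)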
72  pywb/cdx/wsgi_cdxserver.py  Normal file
@ -0,0 +1,72 @@
from cdxserver import CDXServer
import logging
import os
import yaml
import pkgutil

#=================================================================
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'

CONFIG_FILE = 'config.yaml'

DEFAULT_PORT = 8080

if __package__:
    config = pkgutil.get_data(__package__, CONFIG_FILE)
    config = yaml.load(config)
else:
    config = None


#=================================================================
def main():
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    cdx_config = config.get('index_paths') if config else None

    if not cdx_config:
        cdx_config = [TEST_CDX_DIR]

    cdxserver = CDXServer(cdx_config)

    def application(env, start_response):
        try:
            response = cdxserver.load_cdx_from_request(env)
            start_response('200 OK', [('Content-Type', 'text/plain')])

            response = list(response)

        except Exception as exc:
            import traceback
            err_details = traceback.format_exc(exc)
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print err_details

        return response

    return application


if __name__ == "__main__":
    from wsgiref.simple_server import make_server

    app = main()

    port = DEFAULT_PORT
    if config:
        port = config.get('port', DEFAULT_PORT)

    httpd = make_server('', port, app)

    logging.debug('Starting CDX Server on port ' + str(port))

    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass

    logging.debug('Stopping CDX Server')
else:
    application = main()
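The module exposes application for WSGI containers and a wsgiref runner for development. A smoke-test sketch that drives the app in-process after importing the module (setup_testing_defaults is standard wsgiref; the QUERY_STRING handling assumes load_cdx_from_request reads it from the environ):

from wsgiref.util import setup_testing_defaults

environ = {}
setup_testing_defaults(environ)
environ['QUERY_STRING'] = 'url=example.com&limit=1'

def start_response(status, headers):
    print status

# 'application' is the module-level WSGI callable created above
for chunk in application(environ, start_response):
    print chunk,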
@ -1,42 +0,0 @@
from cdxserver import CDXServer
import logging
import os


test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../sample_archive/cdx/'


#=================================================================
def main(config = None):
    logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)

    if not config:
        config = [test_cdx_dir]

    cdxserver = CDXServer(config)

    def application(env, start_response):
        try:
            response = cdxserver.load_cdx_from_request(env)
            start_response('200 OK', [('Content-Type', 'text/plain')])

            response = list(response)

        except Exception as exc:
            import traceback
            err_details = traceback.format_exc(exc)
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print err_details

        return response

    return application


if __name__ == "__main__":
    pass
else:
    application = main()
@ -1,59 +1,34 @@
-import archiveloader
 import views
 import handlers
-import indexreader
 import replay_views
-import replay_resolvers
 import logging
-import hmac
-import time
+from pywb.warc.recordloader import ArcWarcRecordLoader
+from pywb.warc.resolvingloader import ResolvingLoader
+from pywb.rewrite.rewrite_content import RewriteContent

 #=================================================================
 # Config Loading
 #=================================================================
 def load_template_file(file, desc = None, view_class = views.J2TemplateView):
     if file:
-        logging.info('Adding {0}: {1}'.format(desc if desc else name, file))
+        logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
         file = view_class(file)

     return file

 #=================================================================
-# Cookie Signing
-#=================================================================
-
-class HMACCookieMaker:
-    def __init__(self, key, name):
-        self.key = key
-        self.name = name
-
-    def __call__(self, duration, extra_id = ''):
-        expire = str(long(time.time() + duration))
-
-        if extra_id:
-            msg = extra_id + '-' + expire
-        else:
-            msg = expire
-
-        hmacdigest = hmac.new(self.key, msg)
-        hexdigest = hmacdigest.hexdigest()
-
-        if extra_id:
-            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
-        else:
-            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
-
-        return cookie
-
-
-#=================================================================
-def create_wb_handler(cdx_source, config):
-
-    replayer = replay_views.RewritingReplayView(
-
-        resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
-
-        loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
+def create_wb_handler(cdx_server, config):
+    record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
+    paths = config.get('archive_paths')
+
+    resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
+
+    replayer = replay_views.ReplayView(
+        content_loader = resolving_loader,
+        content_rewriter = RewriteContent(),

         head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),

@ -66,7 +41,7 @@ def create_wb_handler(cdx_source, config):

     wb_handler = handlers.WBHandler(
-        cdx_source,
+        cdx_server,

         replayer,
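For reference, the new wiring can be reproduced by hand; a sketch, with both sample paths below being assumptions, and IndexReader accepting a path string per the indexreader changes later in this diff:

import indexreader
from config_utils import create_wb_handler

cdx_server = indexreader.IndexReader('./sample_archive/cdx/')
config = {'archive_paths': './sample_archive/warcs/',
          'cookie_maker': None,
          'head_insert_html': None}

# sketch: same wiring as pywb_config_manual performs per collection
wb_handler = create_wb_handler(cdx_server, config)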
@ -1,13 +1,12 @@
-import views
-import utils
 import urlparse

-from wbrequestresponse import WbResponse
-from wburl import WbUrl
-from wbexceptions import WbException, NotFoundException
-
 import pkgutil
 import mimetypes
+import time
+
+from pywb.rewrite.wburl import WbUrl
+from wbrequestresponse import WbResponse
+from wbexceptions import WbException, NotFoundException
+from views import TextCapturesView


 class BaseHandler:
@ -22,23 +21,22 @@ class BaseHandler:
 # Standard WB Handler
 #=================================================================
 class WBHandler(BaseHandler):
-    def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
-        self.cdx_reader = cdx_reader
+    def __init__(self, index_reader, replay, html_view = None, search_view = None):
+        self.index_reader = index_reader
         self.replay = replay

-        self.text_view = views.TextCapturesView()
+        self.text_view = TextCapturesView()

         self.html_view = html_view
         self.search_view = search_view


     def __call__(self, wbrequest):

         if wbrequest.wb_url_str == '/':
             return self.render_search_page(wbrequest)

-        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
-            cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
+        with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
+            cdx_lines = self.index_reader.load_for_request(wbrequest)

         # new special modifier to always show cdx index
         if wbrequest.wb_url.mod == 'cdx_':
@ -48,8 +46,8 @@ class WBHandler(BaseHandler):
             query_view = self.html_view if self.html_view else self.text_view
             return query_view.render_response(wbrequest, cdx_lines)

-        with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
-            return self.replay(wbrequest, cdx_lines, self.cdx_reader)
+        with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
+            return self.replay(wbrequest, cdx_lines)


     def render_search_page(self, wbrequest):
@ -60,18 +58,18 @@ class WBHandler(BaseHandler):


     def __str__(self):
-        return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
+        return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)

 #=================================================================
 # CDX-Server Handler -- pass all params to cdx server
 #=================================================================
 class CDXHandler(BaseHandler):
-    def __init__(self, cdx_server, view = None):
-        self.cdx_server = cdx_server
-        self.view = view if view else views.TextCapturesView()
+    def __init__(self, index_reader, view = None):
+        self.index_reader = index_reader
+        self.view = view if view else TextCapturesView()

     def __call__(self, wbrequest):
-        cdx_lines = self.cdx_server.load_cdx_from_request(wbrequest.env)
+        cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)

         return self.view.render_response(wbrequest, cdx_lines)

@ -81,7 +79,7 @@ class CDXHandler(BaseHandler):
         return None

     def __str__(self):
-        return 'CDX Server: ' + str(self.cdx_server)
+        return 'Index Reader: ' + str(self.index_reader)


 #=================================================================
@ -136,4 +134,19 @@ class DebugEchoHandler(BaseHandler):
         return WbResponse.text_response(str(wbrequest))


+#=================================================================
+class PerfTimer:
+    def __init__(self, perfdict, name):
+        self.perfdict = perfdict
+        self.name = name
+
+    def __enter__(self):
+        self.start = time.clock()
+        return self
+
+    def __exit__(self, *args):
+        self.end = time.clock()
+        if self.perfdict is not None:
+            self.perfdict[self.name] = str(self.end - self.start)
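The new local PerfTimer replaces utils.PerfTimer; a usage sketch matching how WBHandler.__call__ uses it above (do_work is a hypothetical stand-in for the timed section):

perf = {}
with PerfTimer(perf, 'query') as t:
    do_work()   # hypothetical work to be timed

# perf now maps the name to the elapsed time as a string, e.g. {'query': '0.002...'}
print perf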
@ -1,17 +1,22 @@
 import urllib
 import urllib2
 import wbexceptions
-import wbrequestresponse
-from collections import OrderedDict

-from cdxserver.cdxserver import CDXServer, CDXException
-from cdxserver.cdxobject import CDXObject
+from itertools import chain
+from pprint import pprint

-import logging
+from pywb.cdx.cdxserver import CDXServer, CDXException
+from pywb.cdx.cdxobject import CDXObject

 #=================================================================
-class IndexReader:
-    def load_for_request(self, wbrequest, parsed_cdx = True):
+class IndexReader(object):
+    def __init__(self, config):
+        if isinstance(config, str):
+            self.cdx_server = CDXServer(config)
+        else:
+            self.cdx_server = CDXServer.create_from_config(config)
+
+    def load_for_request(self, wbrequest):
         wburl = wbrequest.wb_url

         # init standard params
@ -24,147 +29,27 @@ class IndexReader:
         if wbrequest.custom_params:
             params.update(wbrequest.custom_params)

-        #params['url'] = wburl.url
-        output = 'raw' if parsed_cdx else 'text'
+        params['url'] = wburl.url

         try:
-            cdxlines = self.load_cdx(url = wburl.url, output = output, **params)
+            cdxlines = self.load_cdx(output='raw', **params)
         except CDXException:
             raise wbexceptions.BadUrlException('Bad Request Url: ' + wburl.url)

-        cdxlines = utils.peek_iter(cdxlines)
+        cdxlines = self.peek_iter(cdxlines)

         if cdxlines is None:
             raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

-        cdxlines = self.filter_cdx(wbrequest, cdxlines)
-
         return cdxlines

-    def filter_cdx(self, wbrequest, cdxlines):
-        # Subclasses may wrap cdxlines iterator in a filter
-        return cdxlines
-
-    def load_cdx(self, url, params = {}, parsed_cdx = True):
-        raise NotImplementedError('Override in subclasses')
-
-    @staticmethod
-    def make_best_cdx_source(paths, config):
-        # may be a string or list
-        surt_ordered = config.get('surt_ordered', True)
-
-        # support mixed cdx streams and remote servers?
-        # for now, list implies local sources
-        if isinstance(paths, list):
-            if len(paths) > 1:
-                return EmbeddedCDXServer(paths, surt_ordered)
-            else:
-                # treat as non-list
-                paths = paths[0]
-
-        # a single uri
-        uri = paths
-
-        # Check for remote cdx server
-        if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'):
-            cookie = config.get('cookie', None)
-            return RemoteCDXServer(uri, cookie = cookie)
-        else:
-            return EmbeddedCDXServer([uri], surt_ordered)
-
-
-#=================================================================
-class EmbeddedCDXServer(CDXServer, IndexReader):
+    def load_cdx(self, **params):
+        return self.cdx_server.load_cdx(**params)
+
     def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
         if wburl.type == wburl.URL_QUERY:
             raise NotImplementedError('Url Query Not Yet Supported')

         return {
-            wburl.QUERY:
-                {'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
-
-            wburl.URL_QUERY:
-                {},
-#               raise Exception('Not Yet Implemented')
-#               {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
-#                'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
-#               },
-
-            wburl.REPLAY:
-                {'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True},
-
-            wburl.LATEST_REPLAY:
-                {'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True}
-
-        }[wburl.type]
-
-    def __str__(self):
-        return 'load cdx indexes from ' + str(self.sources)
-
-
-#=================================================================
-class RemoteCDXServer(IndexReader):
-    """
-    >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
-    >>> pprint(x.next().items())
-    [('urlkey', 'com,example)/'),
-     ('timestamp', '20020120142510'),
-     ('original', 'http://example.com:80/'),
-     ('mimetype', 'text/html'),
-     ('statuscode', '200'),
-     ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
-     ('length', '1792')]
-    """
-
-    def __init__(self, server_url, cookie = None):
-        self.server_url = server_url
-        self.auth_cookie = cookie
-
-    def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
-        #url is required, must be passed explicitly!
-        params['url'] = url
-        params.update(**kwvalues)
-
-        urlparams = urllib.urlencode(params, True)
-
-        try:
-            request = urllib2.Request(self.server_url, urlparams)
-
-            if self.auth_cookie:
-                request.add_header('Cookie', self.auth_cookie)
-
-            response = urllib2.urlopen(request)
-        except urllib2.HTTPError, e:
-            if e.code == 403:
-                exc_msg = e.read()
-                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
-                raise wbexceptions.AccessException(msg)
-            else:
-                raise
-
-        if parsed_cdx:
-            return (CDXObject(cdx) for cdx in response)
-        else:
-            return iter(response)
-
-
-# Note: this params are designed to make pywb compatible with the original Java wayback-cdx-server API:
-# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
-# Soon, this will be switched over to support the native pywb cdx server
-
-# BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
-# with lower values if there are too many captures. Ideally, should be around 10-20
-# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
-
-    def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
-        return {
             wburl.QUERY:
                 {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

@ -184,18 +69,20 @@ class RemoteCDXServer(IndexReader):
         }[wburl.type]

+    @staticmethod
+    def peek_iter(iterable):
+        try:
+            first = next(iterable)
+        except StopIteration:
+            return None

-    def __str__(self):
-        return 'server cdx from ' + self.server_url
+        return chain([first], iterable)


+#=================================================================
+class RemoteCDXServer(IndexReader):
+    def __init__(self, remote_url, cookie=None):
+        self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
+        self.cdx_server = CDXServer(self.remote)

-# Testing
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    from pprint import pprint
-
-    test_dir = utils.test_data_dir() + 'cdx/'
-
-    import doctest
-    doctest.testmod()
+    #def load_cdx(self, **params):
+        #return remote.load_cdx(**params)
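peek_iter is how load_for_request now distinguishes "no captures" from a lazy iterator without consuming it; a sketch:

# sketch: peek_iter returns None for an exhausted iterator, otherwise an
# equivalent iterator with the first item re-attached via itertools.chain
assert IndexReader.peek_iter(iter([])) is None

it = IndexReader.peek_iter(iter(['a', 'b']))
print list(it)   # ['a', 'b']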
@ -1,11 +1,12 @@
 import handlers
 import indexreader
 import archivalrouter
+import config_utils
+import proxy

 import os
 import yaml
-import config_utils
 import logging
-import proxy

 #=================================================================
 DEFAULTS = {
@ -49,24 +50,20 @@ def pywb_config_manual(passed_config = {}):
     collections = config.get('collections')

     for name, value in collections.iteritems():
-        route_config = config
-
-        if isinstance(value, dict):
-            # if a dict, extend with base properies
-            index_paths = value['index_paths']
-            route_config = DictChain(value, config)
+        if isinstance(value, str):
+            route_config = config
+            cdx_server = indexreader.IndexReader(value)
         else:
-            index_paths = str(value)
-
-        cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
+            route_config = DictChain(value, config)
+            cdx_server = indexreader.IndexReader(route_config)

         wb_handler = config_utils.create_wb_handler(
-            cdx_source = cdx_source,
+            cdx_server = cdx_server,
             config = route_config,
         )

-        logging.info('Adding Collection: ' + name)
+        logging.debug('Adding Collection: ' + name)

         route_class = route_config.get('route_class', archivalrouter.Route)

@ -74,7 +71,7 @@ def pywb_config_manual(passed_config = {}):

     # cdx query handler
     if route_config.get('enable_cdx_api', False):
-        routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
+        routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server)))


     if config.get('debug_echo_env', False):
@ -125,11 +122,3 @@ def pywb_config(config_file = None):

     return pywb_config_manual(config)
-
-
-import utils
-if __name__ == "__main__" or utils.enable_doctests():
-    # Just test for execution for now
-    #pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml')
-    pywb_config_manual()
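The two collection value forms handled by the loop above can be summarized with this sketch (the paths are assumptions):

config = {
    'collections': {
        # str value: route_config stays the base config
        'web': './sample_archive/cdx/',

        # dict value: extended with the base config via DictChain
        'other': {
            'index_paths': './sample_archive/cdx/',
            'enable_cdx_api': True,
        },
    },
}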
@ -1,269 +0,0 @@
import re
import sys
import itertools

from url_rewriter import UrlRewriter

#=================================================================
class RegexRewriter:
    """
    # Test https->http converter (other tests below in subclasses)
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
    """

    @staticmethod
    def comment_out(string):
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        return lambda x: string

    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'

    DEFAULT_OP = add_prefix

    def __init__(self, rules):
        #rules = self.create_rules(http_prefix)

        # Build regexstr, concatenating regex list
        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = '(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        return True

    def rewrite(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

    def close(self):
        return ''

    def replace(self, m):
        i = 0
        for _, op, count in self.rules:
            i += 1

            full_m = i
            while count > 0:
                i += 1
                count -= 1

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Custom func
            if not hasattr(op, '__call__'):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match
            if i != full_m:
                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

            return result


#=================================================================
class JSRewriter(RegexRewriter):
    """
    >>> test_js('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js(r'location = "http:\/\/example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

    >>> test_js(r'location = "http:\\/\\/example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

    >>> test_js(r"location = 'http://example.com/abc.html/'")
    "WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"

    >>> test_js(r'location = http://example.com/abc.html/')
    'WB_wombat_location = http://example.com/abc.html/'

    >>> test_js(r'location = /http:\/\/example.com/abc.html/')
    'WB_wombat_location = /http:\\\\/\\\\/example.com/abc.html/'

    >>> test_js('"/location" == some_location_val; locations = location;')
    '"/location" == some_location_val; locations = WB_wombat_location;'

    >>> test_js('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

    # custom rules added
    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

    # scheme-agnostic
    >>> test_js('cool_Location = "//example.com/abc.html" //comment')
    'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
    """

    JS_HTTPX = r'(?<="|\')(?:https?:)?\\?/\\?/[A-Za-z0-9:_@.-]+'

    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())
        rules.extend(extra)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, http_prefix):
        return [
            (self.JS_HTTPX, http_prefix, 0),
            (r'(?<!/)\blocation\b', 'WB_wombat_', 0),
            (r'(?<=document\.)domain', 'WB_wombat_', 0),
        ]


#=================================================================
class XMLRewriter(RegexRewriter):
    """
    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
    """

    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        return [
            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]

#=================================================================
class CSSRewriter(RegexRewriter):
    r"""
    >>> test_css("background: url('/some/path.html')")
    "background: url('/web/20131010im_/http://example.com/some/path.html')"

    >>> test_css("background: url('../path.html')")
    "background: url('/web/20131010im_/http://example.com/path.html')"

    >>> test_css("background: url(\"http://domain.com/path.html\")")
    'background: url("/web/20131010im_/http://domain.com/path.html")'

    >>> test_css("background: url(file.jpeg)")
    'background: url(/web/20131010im_/http://example.com/file.jpeg)'

    >>> test_css("background: url('')")
    "background: url('')"

    >>> test_css("background: url (\"weirdpath\')")
    'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

    >>> test_css("@import url ('path.css')")
    "@import url ('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import url('path.css')")
    "@import url('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import ( 'path.css')")
    "@import ( '/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import \"path.css\"")
    '@import "/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../path.css\"")
    '@import (\'/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../url.css\"")
    '@import (\'/web/20131010im_/http://example.com/url.css"'

    >>> test_css("@import (\"url.css\")")
    '@import ("/web/20131010im_/http://example.com/url.css")'

    >>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
    '@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
    """

    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, rewriter):
        return [
            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
        ]

import utils
if __name__ == "__main__" or utils.enable_doctests():
    arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')

    def test_js(string, extra = []):
        return JSRewriter(arcrw, extra).rewrite(string)

    def test_xml(string):
        return XMLRewriter(arcrw).rewrite(string)

    def test_css(string):
        return CSSRewriter(arcrw).rewrite(string)

    import doctest
    doctest.testmod()
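Though this module is removed here (the rewriters move to pywb.rewrite), the rule format it defines is worth noting: each rule is a (regex, op, group_count) tuple, and a string op is turned into a prefix function via DEFAULT_OP. A minimal sketch:

# sketch: prefix every absolute http(s) url with an archival prefix
rw = RegexRewriter([(r'https?://[A-Za-z0-9.-]+', '/web/', 0)])
print rw.rewrite('see http://example.com for details')
# see /web/http://example.com for details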
@ -1,30 +1,30 @@
 import StringIO
-from urllib2 import URLError
-import chardet
-import copy
-import itertools

-import archiveloader
-from wbrequestresponse import WbResponse, StatusAndHeaders
-import utils
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.utils.bufferedreaders import ChunkedDataReader
+from wbrequestresponse import WbResponse

-from url_rewriter import UrlRewriter
-from header_rewriter import HeaderRewriter
-import html_rewriter
-import regex_rewriters

 import wbexceptions


 #=================================================================
 class ReplayView:
-    def __init__(self, resolvers, loader = None, reporter = None):
-        self.resolvers = resolvers
-        self.loader = loader if loader else archiveloader.ArchiveLoader()
+    def __init__(self, content_loader, content_rewriter, head_insert_view = None,
+                 redir_to_exact = True, buffer_response = False, reporter = None):
+
+        self.content_loader = content_loader
+        self.content_rewriter = content_rewriter
+
+        self.head_insert_view = head_insert_view
+
+        self.redir_to_exact = redir_to_exact
+        # buffer or stream rewritten response
+        self.buffer_response = buffer_response

         self._reporter = reporter


-    def __call__(self, wbrequest, cdx_lines, cdx_reader):
+    def __call__(self, wbrequest, cdx_lines):
         last_e = None
         first = True

@ -40,9 +40,22 @@ class ReplayView:
                 self._redirect_if_needed(wbrequest, cdx)
                 first = False

-                (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
+                (status_headers, stream) = self.content_loader.resolve_headers_and_payload(cdx, failed_files)

-                response = self.make_response(wbrequest, cdx, status_headers, stream)
+                # check and reject self-redirect
+                self._reject_self_redirect(wbrequest, cdx, status_headers)
+
+                # check if redir is needed
+                self._redirect_if_needed(wbrequest, cdx)
+
+                response = None
+
+                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
+                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
+                else:
+                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
+                    response_iter = self.stream_to_iter(stream)
+                    response = WbResponse(status_headers, response_iter)

                 # notify reporter callback, if any
                 if self._reporter:
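The else branch above is the new unrewritten ('id_') path, built from two small helpers defined in the next hunk. A sketch of stream_to_iter in isolation, with StringIO standing in for an archive record stream:

import StringIO

# sketch: stream_to_iter drains and then closes any file-like object
for buff in ReplayView.stream_to_iter(StringIO.StringIO('abc')):
    print buff,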
@ -62,288 +75,57 @@ class ReplayView:
|
|||||||
else:
|
else:
|
||||||
raise wbexceptions.UnresolvedArchiveFileException()
|
raise wbexceptions.UnresolvedArchiveFileException()
|
||||||
|
|
||||||
|
|
||||||
# callback to issue a redirect to another request
|
|
||||||
# subclasses may provide custom logic
|
|
||||||
def _redirect_if_needed(self, wbrequest, cdx):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _load(self, cdx, revisit, failed_files):
|
|
||||||
if revisit:
|
|
||||||
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
|
|
||||||
else:
|
|
||||||
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
|
|
||||||
|
|
||||||
#optimization: if same file already failed this request, don't try again
|
|
||||||
if failed_files and filename in failed_files:
|
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
|
||||||
|
|
||||||
any_found = False
|
|
||||||
last_exc = None
|
|
||||||
for resolver in self.resolvers:
|
|
||||||
possible_paths = resolver(filename)
|
|
||||||
|
|
||||||
if possible_paths:
|
|
||||||
for path in possible_paths:
|
|
||||||
any_found = True
|
|
||||||
try:
|
|
||||||
return self.loader.load(path, offset, length)
|
|
||||||
|
|
||||||
except Exception as ue:
|
|
||||||
last_exc = ue
|
|
||||||
print last_exc
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Unsuccessful if reached here
|
|
||||||
if failed_files:
|
|
||||||
failed_files.append(filename)
|
|
||||||
|
|
||||||
if not any_found:
|
|
||||||
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
|
||||||
else:
|
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_headers_and_payload(self, cdx, wbrequest, cdx_reader, failed_files):
|
|
||||||
has_curr = (cdx['filename'] != '-')
|
|
||||||
has_orig = (cdx.get('orig.filename','-') != '-')
|
|
||||||
|
|
||||||
# load headers record from cdx['filename'] unless it is '-' (rare)
|
|
||||||
headers_record = self._load(cdx, False, failed_files) if has_curr else None
|
|
||||||
|
|
||||||
# two index lookups
|
|
||||||
# Case 1: if mimetype is still warc/revisit
|
|
||||||
if cdx['mimetype'] == 'warc/revisit' and headers_record:
|
|
||||||
payload_record = self._load_different_url_payload(wbrequest, cdx_reader, cdx, headers_record, failed_files)
|
|
||||||
|
|
||||||
# single lookup cases
|
|
||||||
# case 2: non-revisit
|
|
||||||
elif (has_curr and not has_orig):
|
|
||||||
payload_record = headers_record
|
|
||||||
|
|
||||||
# case 3: identical url revisit, load payload from orig.filename
|
|
||||||
elif (has_orig):
|
|
||||||
payload_record = self._load(cdx, True, failed_files)
|
|
||||||
|
|
||||||
# special case: set header to payload if old-style revisit with missing header
|
|
||||||
if not headers_record:
|
|
||||||
headers_record = payload_record
|
|
||||||
elif headers_record != payload_record:
|
|
||||||
# close remainder of stream as this record only used for (already parsed) headers
|
|
||||||
headers_record.stream.close()
|
|
||||||
|
|
||||||
# special case: check if headers record is actually empty (eg empty revisit), then use headers from revisit
|
|
||||||
if not headers_record.status_headers.headers:
|
|
||||||
headers_record = payload_record
|
|
||||||
|
|
||||||
|
|
||||||
if not headers_record or not payload_record:
|
|
||||||
raise wbexceptions.CaptureException('Invalid CDX' + str(cdx))
|
|
||||||
|
|
||||||
|
|
||||||
#response = WbResponse(headers_record.status_headers, self.create_stream_gen(payload_record.stream))
|
|
||||||
#response._stream = payload_record.stream
|
|
||||||
return (cdx, headers_record.status_headers, payload_record.stream)
|
|
||||||
|
|
||||||
|
|
||||||
# done here! just return response
|
|
||||||
# subclasses make override to do additional processing
|
|
||||||
def make_response(self, wbrequest, cdx, status_headers, stream):
|
|
||||||
return self.create_stream_response(status_headers, stream)
|
|
||||||
|
|
||||||
|
|
||||||
# create response from headers and wrapping stream in generator
|
|
||||||
def create_stream_response(self, status_headers, stream):
|
|
||||||
return WbResponse(status_headers, self.create_stream_gen(stream))
|
|
||||||
|
|
||||||
|
|
||||||
# Handle the case where a duplicate of a capture with same digest exists at a different url
|
|
||||||
# Must query the index at that url filtering by matching digest
|
|
||||||
# Raise exception if no matches found
|
|
||||||
def _load_different_url_payload(self, wbrequest, cdx_reader, cdx, headers_record, failed_files):
|
|
||||||
ref_target_uri = headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI')
|
|
||||||
|
|
||||||
# Check for unresolved revisit error, if refers to target uri not present or same as the current url
|
|
||||||
if not ref_target_uri or (ref_target_uri == headers_record.rec_headers.get_header('WARC-Target-URI')):
|
|
||||||
raise wbexceptions.CaptureException('Missing Revisit Original' + str(cdx))
|
|
||||||
|
|
||||||
ref_target_date = headers_record.rec_headers.get_header('WARC-Refers-To-Date')
|
|
||||||
|
|
||||||
if not ref_target_date:
|
|
||||||
ref_target_date = cdx['timestamp']
|
|
||||||
else:
|
|
||||||
ref_target_date = utils.iso_date_to_timestamp(ref_target_date)
|
|
||||||
|
|
||||||
# clone WbRequest
|
|
||||||
orig_wbreq = copy.copy(wbrequest)
|
|
||||||
orig_wbreq.wb_url = copy.copy(orig_wbreq.wb_url)
|
|
||||||
|
|
||||||
orig_wbreq.wb_url.url = ref_target_uri
|
|
||||||
orig_wbreq.wb_url.timestamp = ref_target_date
|
|
||||||
|
|
||||||
# Must also match digest
|
|
||||||
orig_wbreq.query_filter.append('digest:' + cdx['digest'])
|
|
||||||
|
|
||||||
orig_cdx_lines = cdx_reader.load_for_request(orig_wbreq, parsed_cdx = True)
|
|
||||||
|
|
||||||
for cdx in orig_cdx_lines:
|
|
||||||
try:
|
|
||||||
#cdx = cdx_reader.CDXCaptureResult(cdx)
|
|
||||||
#print cdx
|
|
||||||
payload_record = self._load(cdx, False, failed_files)
|
|
||||||
return payload_record
|
|
||||||
|
|
||||||
except wbexceptions.CaptureException as e:
|
|
||||||
pass
|
|
||||||
|
|
||||||
raise wbexceptions.CaptureException('Original for revisit could not be loaded')
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_full(self, filename):
|
|
||||||
# Attempt to resolve cdx file to full path
|
|
||||||
full_url = None
|
|
||||||
for resolver in self.resolvers:
|
|
||||||
full_url = resolver(filename)
|
|
||||||
if full_url:
|
|
||||||
return full_url
|
|
||||||
|
|
||||||
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
|
||||||
|
|
||||||
|
|
||||||
# Create a generator reading from a stream, with optional rewriting and final read call
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_stream_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
|
def stream_to_iter(stream):
|
||||||
try:
|
try:
|
||||||
buff = first_buff if first_buff else stream.read()
|
buff = stream.read()
|
||||||
while buff:
|
while buff:
|
||||||
if rewrite_func:
|
|
||||||
buff = rewrite_func(buff)
|
|
||||||
yield buff
|
yield buff
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
|
|
||||||
# For adding a tail/handling final buffer
|
|
||||||
if final_read_func:
|
|
||||||
buff = final_read_func()
|
|
||||||
if buff:
|
|
||||||
yield buff
|
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
stream.close()
|
stream.close()
|
||||||
|
|
||||||
|
def sanitize_content(self, status_headers, stream):
|
||||||
|
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||||
|
if (status_headers.remove_header('transfer-encoding')):
|
||||||
|
stream = ChunkedDataReader(stream)
|
||||||
|
|
||||||
def __str__(self):
|
return (status_headers, stream)
|
||||||
return 'find archive files from ' + str(self.resolvers)
|
|
||||||
|
|
||||||
#=================================================================
class RewritingReplayView(ReplayView):

    def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
        ReplayView.__init__(self, resolvers, loader, reporter)
        self.head_insert_view = head_insert_view
        self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
        self.redir_to_exact = redir_to_exact

        # buffer or stream rewritten response
        self.buffer_response = buffer_response


    def _text_content_type(self, content_type):
        for ctype, mimelist in self.REWRITE_TYPES.iteritems():
            if any ((mime in content_type) for mime in mimelist):
                return ctype

        return None


    def make_response(self, wbrequest, cdx, status_headers, stream):
        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        self._redirect_if_needed(wbrequest, cdx)
    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        urlrewriter = wbrequest.urlrewriter

        rewritten_headers = self.header_rewriter.rewrite(status_headers, urlrewriter)

        # de_chunking in case chunk encoding is broken
        # TODO: investigate further
        de_chunk = False

        # handle transfer-encoding: chunked
        if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
            stream = archiveloader.ChunkedLineReader(stream)
            de_chunk = True

        # transparent, though still may need to dechunk
        if wbrequest.wb_url.mod == 'id_':
            if de_chunk:
                status_headers.remove_header('transfer-encoding')

            return self.create_stream_response(status_headers, stream)

        # non-text content type, just send through with rewritten headers
        # but may need to dechunk
        if rewritten_headers.text_type is None:
            status_headers = rewritten_headers.status_headers
            return self.create_stream_response(status_headers, stream)

        # Handle text rewriting

        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
            stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())

        # TODO: is this right?
        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
            first_buff = None
        else:
            (encoding, first_buff) = self._detect_charset(stream)

        # if chardet thinks its ascii, use utf-8
        if encoding == 'ascii':
            #encoding = None
            encoding = 'utf-8'

        # Buffering response for html, streaming for others?
        #if rewritten_headers.text_type == 'html':
        #    return self._rewrite_html(encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)
        #else:
        #    return self._rewrite_other(rewritten_headers.text_type, encoding, urlrewriter, stream, rewritten_headers.status_headers, first_buff)

        text_type = rewritten_headers.text_type
        status_headers = rewritten_headers.status_headers

        if text_type == 'html':
            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
            rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
        elif text_type == 'css':
            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
        elif text_type == 'js':
            rewriter = regex_rewriters.JSRewriter(urlrewriter)
        elif text_type == 'xml':
            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
        else:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

        # Create generator for response
        response_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)

        if self.buffer_response:
            return self._create_buffer_response(status_headers, response_gen)
        else:
            return WbResponse(status_headers, value = response_gen)

    def rewrite_content(self, wbrequest, cdx, status_headers, stream):
        urlrewriter = wbrequest.urlrewriter

        (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)

        # no rewriting needed!
        if rewritten_headers.text_type is None:
            response_iter = self.stream_to_iter(stream)
            return WbResponse(rewritten_headers.status_headers, response_iter)

        # do head insert
        if self.head_insert_view:
            head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
        else:
            head_insert_str = None

        (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)

        if self.buffer_response:
            if wbrequest.wb_url.mod == 'id_':
                status_headers.remove_header('content-length')

            return self.buffered_response(status_headers, response_gen)

        return WbResponse(status_headers, response_gen)
    # Buffer rewrite generator and return a response from a string
    # Buffer rewrite iterator and return a response from a string
    def _create_buffer_response(self, status_headers, generator):
    def buffered_response(self, status_headers, iterator):
        out = StringIO.StringIO()

        try:
            for buff in generator:
            for buff in iterator:
                out.write(buff)

        finally:
@ -355,53 +137,9 @@ class RewritingReplayView(ReplayView):

        return WbResponse(status_headers, value = [content])

    # Create rewrite response from record (no Content-Length), may even be chunked by front-end
    def _create_rewrite_stream(self, rewriter, encoding, stream, first_buff = None):
        def do_rewrite(buff):
            if encoding:
                buff = self._decode_buff(buff, stream, encoding)

            buff = rewriter.rewrite(buff)

            if encoding:
                buff = buff.encode(encoding)

            return buff

        def do_finish():
            return rewriter.close()

        return self.create_stream_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)


    def _decode_buff(self, buff, stream, encoding):
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError, e:
            # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
            for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

        return buff


    def _detect_charset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)


    def _redirect_if_needed(self, wbrequest, cdx):
        is_proxy = wbrequest.is_proxy
        if self.redir_to_exact and not is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
        if self.redir_to_exact and not wbrequest.is_proxy and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
            new_url = wbrequest.urlrewriter.get_timestamp_url(cdx['timestamp'], cdx['original'])
            raise wbexceptions.InternalRedirect(new_url)
pywb/rewrite/README.md (new file)
@ -0,0 +1,47 @@
## PyWb Rewrite v0.2

[](https://travis-ci.org/ikreymer/pywb_rewrite)

This package includes the content rewriting component of the pywb wayback tool suite.

This package applies standard content rewriting, in the form of url rewriting, for
HTTP headers, html, css, js and xml content.

Additional domain-specific rewriting is planned, especially for JS, to allow for proper
replay of difficult pages.


### Command-Line Rewriter

To enable easier testing of rewriting, this package includes a command-line rewriter
which will fetch a live url and apply the registered rewriting rules to that url.

After installing with:

`pip install -r requirements.txt`

Run:

`python ./pywb_rewrite/rewrite_live.py http://example.com`

To specify a custom timestamp and prefix:

```
python ./pywb_rewrite/rewrite_live.py http://example.com /mycoll/20141026000102/http://mysite.example.com/path.html
```

This will print to stdout the content of `http://example.com` with all urls rewritten relative to
`/mycoll/20141026000102/http://mysite.example.com/path.html`.

Headers are also rewritten. For further details, consult the `get_rewritten` function in
[pywb_rewrite/rewrite_live.py](pywb_rewrite/rewrite_live.py).


### Tests

Rewriting doctests as well as live rewriting tests (subject to change) are provided.
To run the full test suite: `python run-tests.py`
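A minimal sketch of the same rewriting invoked programmatically (the timestamp, prefix, and target url are illustrative, mirroring the command-line example above):

```python
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter

# rewrite everything relative to an (illustrative) wayback path and collection prefix
urlrewriter = UrlRewriter('20141026000102/http://mysite.example.com/path.html', '/mycoll/')

# fetch the live page and apply header + content rewriting
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

print buff
```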
@ -1,4 +1,4 @@
from wbrequestresponse import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders

#=================================================================
class RewrittenStatusAndHeaders:
@ -14,37 +14,6 @@ class RewrittenStatusAndHeaders:

#=================================================================
class HeaderRewriter:
    """
    # Text with charset
    >>> test_rewrite([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
    {'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
      ('X-Archive-Orig-Content-Length', '5'),
      ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

    # Redirect
    >>> test_rewrite([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
      ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

    # gzip
    >>> test_rewrite([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
      ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

    # Binary
    >>> test_rewrite([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
    {'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
      ('Content-Type', 'image/png'),
      ('X-Archive-Orig-Cookie', 'blah'),
      ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

    Removing Transfer-Encoding always, Was:
      ('Content-Encoding', 'gzip'),
      ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}

    """

    REWRITE_TYPES = {
        'html': ['text/html', 'application/xhtml'],
        'css': ['text/css'],
@ -122,20 +91,3 @@ class HeaderRewriter:
        return (new_headers, removed_header_dict)

import utils
if __name__ == "__main__" or utils.enable_doctests():
    import os
    import pprint
    import url_rewriter

    urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

    headerrewriter = HeaderRewriter()

    def test_rewrite(headers, status = '200 OK'):
        rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
        return vars(rewritten)

    import doctest
    doctest.testmod()
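For orientation, a short sketch of `HeaderRewriter` usage, assembled from the doctests above (the wayback path and prefix are illustrative):

```python
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

# rewrite the header block of a text/html response
rewritten = HeaderRewriter().rewrite(
    StatusAndHeaders('200 OK', [('Content-Type', 'text/html;charset=UTF-8')]),
    urlrewriter)

print rewritten.text_type, rewritten.charset
# html utf-8
```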
@ -12,75 +12,8 @@ from regex_rewriters import JSRewriter, CSSRewriter
# HTMLRewriter -- html parser for custom rewriting, also handlers for script and css
#=================================================================
class HTMLRewriter(HTMLParser):
    r"""
    >>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

    >>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

    >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

    >>> parse('<input "selected"><img src></div>')
    <input "selected"=""><img src=""></div>

    >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
    <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

    # HTML Entities
    >>> parse('<a href="">› ></div>')
    <a href="">› ></div>

    # Don't rewrite anchors
    >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
    <HTML><a href="#abc">Text</a></html>

    # Unicode
    >>> parse('<a href="http://испытание.испытание/">испытание</a>')
    <a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

    # Meta tag
    >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

    >>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
    <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

    >>> parse('<META http-equiv="refresh" content>')
    <meta http-equiv="refresh" content="">

    # Script tag
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

    # Unterminated script tag auto-terminate
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>

    >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

    >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

    >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>

    # Unterminated style tag auto-terminate
    >>> parse('<style>@import url(styles.css)')
    <style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

    # Head Insertion
    >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
    <html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

    >>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
    /* Insert */<body><div>SomeTest</div>

    >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
    <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

    """

    """
    HTML-Parsing Rewriter
    """

    REWRITE_TAGS = {
@ -307,16 +240,4 @@ class HTMLRewriter(HTMLParser):
        self.out.write(']>')

import utils
if __name__ == "__main__" or utils.enable_doctests():

    url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

    def parse(data, head_insert = None):
        parser = HTMLRewriter(url_rewriter, head_insert = head_insert)
        print parser.rewrite(data) + parser.close()

    import doctest
    doctest.testmod()
pywb/rewrite/regex_rewriters.py (new file)
@ -0,0 +1,156 @@
import re
import sys
import itertools

from url_rewriter import UrlRewriter

#=================================================================
class RegexRewriter(object):
    @staticmethod
    def comment_out(string):
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        return lambda x: string

    HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'


    DEFAULT_OP = add_prefix


    def __init__(self, rules):
        #rules = self.create_rules(http_prefix)

        # Build regexstr, concatenating regex list
        regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = '(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        return True

    def rewrite(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

    def close(self):
        return ''

    def replace(self, m):
        i = 0
        for _, op, count in self.rules:
            i += 1

            full_m = i
            while count > 0:
                i += 1
                count -= 1

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Custom func
            if not hasattr(op, '__call__'):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match
            if i != full_m:
                result = m.string[m.start(full_m):m.start(i)] + result + m.string[m.end(i):m.end(full_m)]

            return result


#=================================================================
class JSLinkRewriter(RegexRewriter):
    """
    JS Rewriter which rewrites absolute http://, https:// and // urls
    at the beginning of a string
    """
    JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'

    def __init__(self, rewriter, rules = []):
        rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
        super(JSLinkRewriter, self).__init__(rules)

#=================================================================
class JSLocationAndLinkRewriter(JSLinkRewriter):
    """
    JS Rewriter which also rewrites location and domain to the
    specified prefix (default: 'WB_wombat_')
    """

    def __init__(self, rewriter, rules = [], prefix = 'WB_wombat_'):
        rules = rules + [
            (r'(?<!/)\blocation\b', prefix, 0),
            (r'(?<=document\.)domain', prefix, 0),
        ]
        super(JSLocationAndLinkRewriter, self).__init__(rewriter, rules)

#=================================================================
# Set 'default' JSRewriter
JSRewriter = JSLocationAndLinkRewriter


#=================================================================
class XMLRewriter(RegexRewriter):
    def __init__(self, rewriter, extra = []):
        rules = self._create_rules(rewriter.get_abs_url())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        return [
            ('([A-Za-z:]+[\s=]+)?["\'\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]

#=================================================================
class CSSRewriter(RegexRewriter):
    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)


    def _create_rules(self, rewriter):
        return [
            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
        ]
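A short usage sketch of these rewriters, taken from the doctests in the test suite below (the archival prefix and timestamp are illustrative):

```python
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter

# rewrite urls relative to an (illustrative) archival prefix
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')

print JSRewriter(arcrw).rewrite('location = "http://example.com/abc.html"')
# WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"

print CSSRewriter(arcrw).rewrite("background: url('/some/path.html')")
# background: url('/web/20131010im_/http://example.com/some/path.html')
```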
pywb/rewrite/rewrite_content.py (new file)
@ -0,0 +1,151 @@
import chardet

from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriter
from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader

class RewriteContent:

    DEFAULT_CONTENT_REWRITERS = {
        'header': HeaderRewriter,
        'js': JSRewriter,
        'css': CSSRewriter,
        'xml': XMLRewriter,
        'html': HTMLRewriter
    }


    def __init__(self, rewriters = {}):
        self.rewriters = dict(self.DEFAULT_CONTENT_REWRITERS.items() + rewriters.items())


    def rewrite_headers(self, urlrewriter, status_headers, stream):
        rewritten_headers = self.rewriters['header']().rewrite(status_headers, urlrewriter)

        # note: since chunking may be broken, approach taken here is to *always* attempt
        # to dechunk if transfer-encoding: chunked is present
        #
        # an alternative may be to serve chunked unless content rewriting is needed
        # todo: possibly revisit this approach

        if (rewritten_headers.contains_removed_header('transfer-encoding', 'chunked')):
            stream = ChunkedDataReader(stream)

        return (rewritten_headers, stream)

    def rewrite_content(self, urlrewriter, headers, stream, head_insert_str = None):
        # see if we've already rewritten headers
        if isinstance(headers, RewrittenStatusAndHeaders):
            rewritten_headers = headers
        elif isinstance(headers, StatusAndHeaders):
            # otherwise, need to determine if rewriting is even necessary
            (rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream)

        status_headers = rewritten_headers.status_headers

        # no rewriting needed here
        if rewritten_headers.text_type is None:
            gen = self.stream_to_gen(stream)
            return (status_headers, gen)

        # Handle text content rewriting
        # =========================================================================
        # special case -- need to ungzip the body
        if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
            stream = BufferedReader(stream, 'gzip')

        if rewritten_headers.charset:
            encoding = rewritten_headers.charset
            first_buff = None
        else:
            (encoding, first_buff) = self._detect_charset(stream)

        # if chardet thinks its ascii, use utf-8
        if encoding == 'ascii':
            encoding = 'utf-8'

        text_type = rewritten_headers.text_type

        rewriter_class = self.rewriters.get(text_type)
        if not rewriter_class:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)


        if text_type == 'html':
            rewriter = rewriter_class(urlrewriter, outstream = None, head_insert = head_insert_str)
        else:
            rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
        gen = self._rewriting_stream_gen(rewriter, encoding, stream, first_buff)
        return (status_headers, gen)


    # Create rewrite stream, may even be chunked by front-end
    def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff = None):
        def do_rewrite(buff):
            if encoding:
                buff = self._decode_buff(buff, stream, encoding)

            buff = rewriter.rewrite(buff)

            if encoding:
                buff = buff.encode(encoding)

            return buff

        def do_finish():
            return rewriter.close()

        return self.stream_to_gen(stream, rewrite_func = do_rewrite, final_read_func = do_finish, first_buff = first_buff)


    def _decode_buff(self, buff, stream, encoding):
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError, e:
            # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
            for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

        return buff


    def _detect_charset(self, stream):
        buff = stream.read(8192)
        result = chardet.detect(buff)
        print "chardet result: " + str(result)
        return (result['encoding'], buff)


    # Create a generator reading from a stream, with optional rewriting and final read call
    @staticmethod
    def stream_to_gen(stream, rewrite_func = None, final_read_func = None, first_buff = None):
        try:
            buff = first_buff if first_buff else stream.read()
            while buff:
                if rewrite_func:
                    buff = rewrite_func(buff)
                yield buff
                buff = stream.read()

            # For adding a tail/handling final buffer
            if final_read_func:
                buff = final_read_func()
                if buff:
                    yield buff

        finally:
            stream.close()
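A minimal sketch of driving `RewriteContent` directly against an in-memory response (the html snippet and expected output are illustrative; `rewrite_live.py` below wires the same call against the live web):

```python
import StringIO

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

urlrewriter = UrlRewriter('20131226101010/http://example.com/page.html', '/web/')

status_headers = StatusAndHeaders('200 OK',
                                  [('Content-Type', 'text/html; charset=utf-8')])
stream = StringIO.StringIO('<html><a href="/other.html">link</a></html>')

# returns the rewritten headers and a generator over the rewritten body
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)

print ''.join(gen)
# expected (illustrative): href rewritten to /web/20131226101010/http://example.com/other.html
```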
pywb/rewrite/rewrite_live.py (new file)
@ -0,0 +1,68 @@
import urllib2
import os
import sys
import datetime

from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent

"""
Fetch a url from live web and apply rewriting rules
"""

#=================================================================
def get_status_and_stream(url):
    resp = urllib2.urlopen(url)

    headers = []
    for name, value in resp.info().dict.iteritems():
        headers.append((name, value))

    status_headers = StatusAndHeaders('200 OK', headers)
    stream = resp

    return (status_headers, stream)

#=================================================================
def get_rewritten(url, urlrewriter):
    (status_headers, stream) = get_status_and_stream(url)

    status_headers, gen = RewriteContent().rewrite_content(urlrewriter, status_headers, stream)

    buff = ''
    for x in gen:
        buff += x

    return (status_headers, buff)

#=================================================================
def main():
    if len(sys.argv) < 2:
        print 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'.format(sys.argv[0])
        exit(1)
    else:
        url = sys.argv[1]

    if len(sys.argv) >= 3:
        wburl_str = sys.argv[2]
        if wburl_str.startswith('/'):
            wburl_str = wburl_str[1:]

        prefix, wburl_str = wburl_str.split('/', 1)
        prefix = '/' + prefix + '/'
    else:
        wburl_str = datetime_to_timestamp(datetime.datetime.now()) + '/http://example.com/path/sample.html'
        prefix = '/pywb_rewrite/'

    urlrewriter = UrlRewriter(wburl_str, prefix)

    status_headers, buff = get_rewritten(url, urlrewriter)

    sys.stdout.write(buff)


#=================================================================
if __name__ == "__main__":
    main()
pywb/rewrite/test/test_rewrite.py (new file)
@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

r"""

#=================================================================
# HTML Rewriting
#=================================================================

>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

>>> parse('<input "selected"><img src></div>')
<input "selected"=""><img src=""></div>

>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

# HTML Entities
>>> parse('<a href="">› ></div>')
<a href="">› ></div>

# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>

# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">

# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>

>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>

# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>

>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

#=================================================================
# Custom Regex
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'


#=================================================================
# JS Rewriting
#=================================================================

>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'

>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'

>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"

>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'

# not rewritten -- to be handled on client side
>>> _test_js(r'location = "/abc.html"')
'WB_wombat_location = "/abc.html"'

>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'

>>> _test_js('"/location" == some_location_val; locations = location;')
'"/location" == some_location_val; locations = WB_wombat_location;'

>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.comment_out, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'


#=================================================================
# XML Rewriting
#=================================================================

>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'

#=================================================================
# CSS Rewriting
#=================================================================

>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"

>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"

>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'

>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'

>>> _test_css("background: url('')")
"background: url('')"

>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"

>>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'

>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'

>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'

>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'

>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'

#=================================================================
# HTTP Headers Rewriting
#=================================================================

# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'text_type': 'html', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
  ('X-Archive-Orig-Content-Length', '5'),
  ('Content-Type', 'text/html;charset=UTF-8')]), 'removed_header_dict': {}, 'charset': 'utf-8'}

# Redirect
>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect')
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
  ('Location', '/web/20131226101010/http://example.com/other.html')]), 'removed_header_dict': {}, 'charset': None}

# gzip
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': 'js', 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
  ('Content-Type', 'text/javascript')]), 'removed_header_dict': {'transfer-encoding': 'chunked', 'content-encoding': 'gzip'}, 'charset': None}

# Binary
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'text_type': None, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
  ('Content-Type', 'image/png'),
  ('X-Archive-Orig-Cookie', 'blah'),
  ('Content-Encoding', 'gzip')]), 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'charset': None}

Removing Transfer-Encoding always, Was:
  ('Content-Encoding', 'gzip'),
  ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}}

"""

#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.header_rewriter import HeaderRewriter

from pywb.utils.statusandheaders import StatusAndHeaders


urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

def parse(data, head_insert = None):
    parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
    print parser.rewrite(data) + parser.close()

arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')


def _test_js(string, extra = []):
    return JSRewriter(arcrw, extra).rewrite(string)

def _test_xml(string):
    return XMLRewriter(arcrw).rewrite(string)

def _test_css(string):
    return CSSRewriter(arcrw).rewrite(string)

headerrewriter = HeaderRewriter()

def _test_headers(headers, status = '200 OK'):
    rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter)
    return vars(rewritten)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/rewrite/test/test_rewrite_live.py (new file)
@ -0,0 +1,32 @@
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.url_rewriter import UrlRewriter

# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')


def test_example_1():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

    # verify header rewriting
    assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers


def test_example_2():
    status_headers, buff = get_rewritten('http://example.com/', urlrewriter)

    # verify header rewriting
    assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers

    assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff


def test_example_3():
    status_headers, buff = get_rewritten('http://archive.org/', urlrewriter)

    assert '/pywb/20131226101010/http://example.com/about/terms.php' in buff, buff
@ -6,43 +6,43 @@ from wburl import WbUrl

class UrlRewriter:
    """
    >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

    >>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'

    >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/other.html'

    >>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

    >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'

    >>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/20101226101112/http://some-other-site.com'

    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '')
    '2020/http://example.com/other.html'

    >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
    '/web/20131010010203/http://example.com/file.html'

    >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    '#anchor'

    >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'mailto:example@example.com'

    >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
@ -62,7 +62,6 @@ class UrlRewriter:
    def __init__(self, wburl, prefix):
        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
        self.prefix = prefix
        self.archivalurl_class = self.wburl.__class__

        #if self.prefix.endswith('/'):
        #    self.prefix = self.prefix[:-1]
@ -74,7 +73,7 @@ class UrlRewriter:

        wburl = self.wburl

        isAbs = any (url.startswith(x) for x in self.PROTOCOLS)
        isAbs = any(url.startswith(x) for x in self.PROTOCOLS)

        # Optimized rewriter for
        # -rel urls that don't start with / and don't contain ../ and no special mod
@ -117,12 +116,11 @@ class UrlRewriter:
        return url


import utils
if __name__ == "__main__" or utils.enable_doctests():
    def test_rewrite(rel_url, base_url, prefix, mod = None):
        rewriter = UrlRewriter(base_url, prefix)
        return rewriter.rewrite(rel_url, mod)

def do_rewrite(rel_url, base_url, prefix, mod = None):
    rewriter = UrlRewriter(base_url, prefix)
    return rewriter.rewrite(rel_url, mod)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@@ -3,9 +3,38 @@
 import re
 import rfc3987

-import wbexceptions

 # WbUrl : wb archival url representation for WB
+"""
+WbUrl represents the standard wayback archival url format.
+A regular url is a subset of the WbUrl (latest replay).
+
+The WbUrl expresses the common interface for interacting
+with the wayback machine.
+
+The WbUrl may represent one of the following forms:
+
+query form: [/modifier]/[timestamp][-end_timestamp]*/<url>
+
+modifier, timestamp and end_timestamp are optional
+
+*/example.com
+20101112030201*/http://example.com
+2009-2015*/http://example.com
+/cdx/*/http://example.com
+
+url query form: used to indicate query across urls
+same as query form but with a final *
+*/example.com*
+20101112030201*/http://example.com*
+
+
+replay form:
+20101112030201/http://example.com
+20101112030201im_/http://example.com
+
+latest_replay: (no timestamp)
+http://example.com
+"""
+
 class WbUrl:
     """
@@ -38,6 +67,13 @@ class WbUrl:
     >>> repr(WbUrl('*/http://example.com/abc?def=a*'))
     "('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"

+    >>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
+    "('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
+
+    # timestamp range query
+    >>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
+    "('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
+
     >>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
     "('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"

@@ -59,16 +95,16 @@ class WbUrl:
     # ======================
     >>> x = WbUrl('/#$%#/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://#$%#/
+    Exception: Bad Request Url: http://#$%#/

     >>> x = WbUrl('/http://example.com:abc/')
     Traceback (most recent call last):
-    BadUrlException: Bad Request Url: http://example.com:abc/
+    Exception: Bad Request Url: http://example.com:abc/
     """

     # Regexs
     # ======================
-    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
+    QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
     REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')

     QUERY = 'query'
@@ -85,13 +121,14 @@ class WbUrl:
         self.type = None
         self.url = ''
         self.timestamp = ''
+        self.end_timestamp = ''
         self.mod = ''

         if not any (f(url) for f in [self._init_query, self._init_replay]):
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)

         if len(self.url) == 0:
-            raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
+            raise Exception('Invalid WbUrl: ', url)

         # protocol agnostic url -> http://
         #if self.url.startswith('//'):
@@ -105,7 +142,7 @@ class WbUrl:
         matcher = rfc3987.match(self.url.upper(), 'IRI')

         if not matcher:
-            raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
+            raise Exception('Bad Request Url: ' + self.url)

     # Match query regex
     # ======================
@@ -118,7 +155,8 @@ class WbUrl:

         self.mod = res[0]
         self.timestamp = res[1]
-        self.url = res[2]
+        self.end_timestamp = res[2]
+        self.url = res[3]
         if self.url.endswith('*'):
             self.type = self.URL_QUERY
             self.url = self.url[:-1]
@@ -151,6 +189,7 @@ class WbUrl:
         atype = overrides['type'] if 'type' in overrides else self.type
         mod = overrides['mod'] if 'mod' in overrides else self.mod
         timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
+        end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp
         url = overrides['url'] if 'url' in overrides else self.url

         if atype == self.QUERY or atype == self.URL_QUERY:
@@ -159,6 +198,8 @@ class WbUrl:
                 tsmod += mod + "/"
             if timestamp:
                 tsmod += timestamp
+            if end_timestamp:
+                tsmod += '-' + end_timestamp

             tsmod += "*/" + url
             if atype == self.URL_QUERY:
pywb/utils.py (deleted, 122 lines)
@@ -1,122 +0,0 @@
-import itertools
-import time
-import zlib
-import time
-import datetime
-import calendar
-import re
-
-def peek_iter(iterable):
-    try:
-        first = next(iterable)
-    except StopIteration:
-        return None
-
-    return itertools.chain([first], iterable)
-
-
-def split_prefix(key, prefixs):
-    for p in prefixs:
-        if key.startswith(p):
-            plen = len(p)
-            return (key[:plen], key[plen:])
-
-
-def create_decompressor():
-    return zlib.decompressobj(16 + zlib.MAX_WBITS)
-
-
-#=================================================================
-# Adapted from example at
-class PerfTimer:
-    def __init__(self, perfdict, name):
-        self.perfdict = perfdict
-        self.name = name
-
-    def __enter__(self):
-        self.start = time.clock()
-        return self
-
-    def __exit__(self, *args):
-        self.end = time.clock()
-        if self.perfdict is not None:
-            self.perfdict[self.name] = str(self.end - self.start)
-
-
-#=================================================================
-# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
-# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
-# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
-def rel_request_uri(environ, include_query=1):
-    """
-    Return the requested path, optionally including the query string
-
-    # Simple test:
-    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
-    '/web/example.com'
-
-    # Test all unecoded special chars and double-quote
-    # (double-quote must be encoded but not single quote)
-    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
-    "/web/example.com/0~!+$&'()*+,;=:%22"
-    """
-    from urllib import quote
-    url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
-    if include_query and environ.get('QUERY_STRING'):
-        url += '?' + environ['QUERY_STRING']
-
-    return url
-
-
-#=================================================================
-def unsurt(surt):
-    """
-    # Simple surt
-    >>> unsurt('com,example)/')
-    'example.com)/'
-
-    # Broken surt
-    >>> unsurt('com,example)')
-    'com,example)'
-
-    # Long surt
-    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
-    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
-    """
-
-    try:
-        index = surt.index(')/')
-        parts = surt[0:index].split(',')
-        parts.reverse()
-        host = '.'.join(parts)
-        host += surt[index:]
-        return host
-
-    except ValueError:
-        # May not be a valid surt
-        return surt
-
-
-#=================================================================
-# Support for bulk doctest testing via nose or py.test
-# nosetests --with-doctest
-# py.test --doctest_modules
-
-import sys
-is_in_testtool = any(sys.argv[0].endswith(tool) for tool in ['py.test', 'nosetests'])
-
-def enable_doctests():
-    return is_in_testtool
-
-
-def test_data_dir():
-    import os
-    return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
-
-#=================================================================
-
-if __name__ == "__main__" or enable_doctests():
-    import doctest
-    doctest.testmod()
pywb/utils/README.md (new file, 16 lines)
@@ -0,0 +1,16 @@
## PyWb Utils v0.2 ##

[Build Status](https://travis-ci.org/ikreymer/pywb_utils)

This is a standalone module containing a variety of utils used by the pywb wayback tool suite.

`python run-tests.py` will run all tests

#### Modules

[binsearch.py](pywb_utils/binsearch.py) -- Binary search implementation over text files

[loaders.py](pywb_utils/loaders.py) -- Loading abstraction for http, local file system, as well as buffered and seekable file readers

[timeutils.py](pywb_utils/timeutils.py) -- Utility functions for converting between standard datetime formats and the 14-digit timestamp
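With the split into distinct packages in this commit, the modules listed above are importable from pywb.utils, as the test files further below do; for example:

    from pywb.utils.binsearch import iter_exact
    from pywb.utils.loaders import SeekableTextFileReader
    from pywb.utils.timeutils import timestamp_to_datetime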
pywb/utils/__init__.py (new file, empty)

pywb/utils/binsearch.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""
Utility functions for performing binary search over a sorted text file
"""

from collections import deque
import itertools


#=================================================================
def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
    """
    Find offset of the line which matches a given 'key' using binary search
    If key is not found, the offset is of the line after the key

    File is subdivided into block_size (default 8192) sized blocks
    Optional compare_func may be specified
    """
    min_ = 0
    max_ = reader.getsize() / block_size

    while max_ - min_ > 1:
        mid = min_ + ((max_ - min_) / 2)
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            min_ = mid
        else:
            max_ = mid

    return min_ * block_size


#=================================================================
def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
    """
    Perform a binary search for a specified key to within a 'block_size'
    (default 8192) sized block followed by linear search
    within the block to find first matching line.

    When performing linear search, keep track of up to N previous lines before
    first matching line.
    """
    min_ = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(min_)

    if min_ > 0:
        reader.readline()  # skip partial line

    if prev_size > 1:
        prev_deque = deque(maxlen=prev_size)

    line = None

    while True:
        line = reader.readline()
        if not line:
            break
        if compare_func(line, key) >= 0:
            break

        if prev_size == 1:
            prev = line
        elif prev_size > 1:
            prev_deque.append(line)

    def gen_iter(line):
        """
        Create iterator over any previous lines to
        current matched line
        """
        if prev_size == 1:
            yield prev.rstrip()
        elif prev_size > 1:
            for i in prev_deque:
                yield i.rstrip()

        while line:
            yield line.rstrip()
            line = reader.readline()

    return gen_iter(line)


#=================================================================
def iter_prefix(reader, key):
    """
    Creates an iterator which iterates over lines that start with prefix
    'key' in a sorted text file.
    """

    return itertools.takewhile(
        lambda line: line.startswith(key),
        search(reader, key))


#=================================================================
def iter_exact(reader, key, token=' '):
    """
    Create an iterator which iterates over lines where the first field matches
    the 'key', equivalent to a prefix search for key + token.
    Default field terminator/separator is ' '
    """

    return iter_prefix(reader, key + token)
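A quick sketch of how these iterators compose with a seekable reader (the sample cdx path mirrors the tests further below and is illustrative):

    from pywb.utils.binsearch import iter_prefix
    from pywb.utils.loaders import SeekableTextFileReader

    # stream all index lines whose key starts with a surt-ordered prefix
    cdx = SeekableTextFileReader('sample_archive/cdx/iana.cdx')
    for line in iter_prefix(cdx, 'org,iana)/domains'):
        print line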
pywb/utils/bufferedreaders.py (new file, 204 lines)
@@ -0,0 +1,204 @@
import StringIO
import zlib


#=================================================================
def gzip_decompressor():
    """
    Decompressor which can decompress a gzip stream
    """
    return zlib.decompressobj(16 + zlib.MAX_WBITS)


#=================================================================
class BufferedReader(object):
    """
    A buffered line reader which wraps an existing reader.
    Read operations operate on underlying buffer, which is filled to
    block_size (1024 default)

    If an optional decompress type is specified,
    data is fed through the decompressor when read from the buffer.
    Currently supported decompression: gzip

    If decompression fails on first try, data is assumed to not be compressed
    and no exception is thrown. If a failure occurs after data has been
    partially decompressed, the exception is propagated.

    """

    DECOMPRESSORS = {'gzip': gzip_decompressor}

    def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
        self.stream = stream
        self.block_size = block_size

        if decomp_type:
            try:
                self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
            except KeyError:
                raise Exception('Decompression type not supported: ' +
                                decomp_type)
        else:
            self.decompressor = None

        self.buff = None
        self.num_read = 0
        self.max_len = max_len

    def _fillbuff(self, block_size=None):
        if not block_size:
            block_size = self.block_size

        if not self.buff or self.buff.pos >= self.buff.len:
            if self.max_len > 0:
                to_read = min(self.max_len - self.num_read, self.block_size)
            else:
                to_read = self.block_size

            data = self.stream.read(to_read)
            self._process_read(data)

    def _process_read(self, data):
        data = self._decompress(data)
        self.num_read += len(data)
        self.buff = StringIO.StringIO(data)

    def _decompress(self, data):
        if self.decompressor and data:
            try:
                data = self.decompressor.decompress(data)
            except Exception:
                # if first read attempt, assume non-gzipped stream
                if self.num_read == 0:
                    self.decompressor = None
                # otherwise (partly decompressed), something is wrong
                else:
                    raise
        return data

    def read(self, length=None):
        self._fillbuff()
        return self.buff.read(length)

    def readline(self, length=None):
        self._fillbuff()
        return self.buff.readline(length)

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None


#=================================================================
class ChunkedDataException(Exception):
    pass


#=================================================================
class ChunkedDataReader(BufferedReader):
    r"""
    A ChunkedDataReader is a BufferedReader which also supports de-chunking
    of the data if it happens to be http 'chunk-encoded'.

    If at any point the chunked header is not available, the stream is
    assumed to not be chunked and no more dechunking occurs.

    Properly formatted chunked data:
    >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n"));
    >>> c.read() + c.read()
    '1234'

    Non-chunked data:
    >>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read()
    'xyz123!@#'

    Starts like chunked data, but isn't:
    >>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#"));
    >>> c.read() + c.read()
    '1\r\nx123!@#'

    Chunked data cut off part way through:
    >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));
    >>> c.read() + c.read()
    '123412'
    """

    all_chunks_read = False
    not_chunked = False

    # if False, we'll use best-guess fallback for parse errors
    raise_chunked_data_exceptions = False

    def _fillbuff(self, block_size=None):
        if self.not_chunked:
            return BufferedReader._fillbuff(self, block_size)

        if self.all_chunks_read:
            return

        if not self.buff or self.buff.pos >= self.buff.len:
            length_header = self.stream.readline(64)
            self._data = ''

            try:
                self._try_decode(length_header)
            except ChunkedDataException:
                if self.raise_chunked_data_exceptions:
                    raise

                # Can't parse the data as chunked.
                # It's possible that non-chunked data is served
                # with a Transfer-Encoding: chunked.
                # Treat this as non-chunk encoded from here on.
                self._process_read(length_header + self._data)
                self.not_chunked = True

    def _try_decode(self, length_header):
        # decode length header
        try:
            chunk_size = int(length_header.strip().split(';')[0], 16)
        except ValueError:
            raise ChunkedDataException("Couldn't decode length header " +
                                       length_header)

        if not chunk_size:
            # chunk_size 0 indicates end of file
            self.all_chunks_read = True
            #self._process_read('')
            return

        data_len = len(self._data)

        # read chunk
        while data_len < chunk_size:
            new_data = self.stream.read(chunk_size - data_len)

            # if we unexpectedly run out of data,
            # either raise an exception or just stop reading,
            # assuming file was cut off
            if not new_data:
                if self.raise_chunked_data_exceptions:
                    msg = 'Ran out of data before end of chunk'
                    raise ChunkedDataException(msg)
                else:
                    chunk_size = data_len
                    self.all_chunks_read = True

            self._data += new_data
            data_len = len(self._data)

        # if we successfully read a block without running out,
        # it should end in \r\n
        if not self.all_chunks_read:
            clrf = self.stream.read(2)
            if clrf != '\r\n':
                raise ChunkedDataException("Chunk terminator not found.")

        # hand to base class for further processing
        self._process_read(self._data)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
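The gzip fallback described in the BufferedReader docstring can be seen directly; a minimal sketch (gzip wrapper produced via the same 16 + MAX_WBITS wbits convention the module itself uses):

    import StringIO
    import zlib
    from pywb.utils.bufferedreaders import BufferedReader

    # compress some data with a gzip header (wbits = 16 + MAX_WBITS)
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    gzipped = compressor.compress('abc\ndef\n') + compressor.flush()

    # a gzip stream is decompressed transparently...
    print BufferedReader(StringIO.StringIO(gzipped), decomp_type='gzip').readline()
    # ...and a plain stream with decomp_type='gzip' falls back to raw bytes
    print BufferedReader(StringIO.StringIO('abc\ndef\n'), decomp_type='gzip').readline()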
pywb/utils/loaders.py (new file, 152 lines)
@@ -0,0 +1,152 @@
"""
This module provides loaders for the local file system and over http,
for local and remote access
"""

import os
import hmac
import urllib2
import time


#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
    """
    Load a file-like reader over http using range requests
    and an optional cookie created via a cookie_maker
    """
    def __init__(self, cookie_maker=None):
        self.cookie_maker = cookie_maker

    def load(self, url, offset, length):
        if length > 0:
            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
        else:
            range_header = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = range_header

        if self.cookie_maker:
            headers['Cookie'] = self.cookie_maker.make()

        request = urllib2.Request(url, headers=headers)
        return urllib2.urlopen(request)


#=================================================================
# Signed Cookie-Maker
#=================================================================

class HMACCookieMaker(object):
    """
    Utility class to produce signed HMAC digest cookies
    to be used with each http request
    """
    def __init__(self, key, name, duration=10):
        self.key = key
        self.name = name
        # duration in seconds
        self.duration = duration

    def make(self, extra_id=''):
        expire = str(long(time.time() + self.duration))

        if extra_id:
            msg = extra_id + '-' + expire
        else:
            msg = expire

        hmacdigest = hmac.new(self.key, msg)
        hexdigest = hmacdigest.hexdigest()

        if extra_id:
            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
                                              expire, hexdigest)
        else:
            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return cookie


#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
    """
    Load a file-like reader from the local file system
    """

    def load(self, url, offset, length):
        if url.startswith('file://'):
            url = url[len('file://'):]

        afile = open(url, 'rb')
        afile.seek(offset)

        if length > 0:
            return LimitReader(afile, length)


#=================================================================
# Limit Reader
#=================================================================
class LimitReader(object):
    """
    A reader which will not read more than specified limit
    """

    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

        if not self.limit:
            self.limit = 1

    def read(self, length=None):
        length = min(length, self.limit) if length else self.limit
        buff = self.stream.read(length)
        self.limit -= len(buff)
        return buff

    def readline(self, length=None):
        length = min(length, self.limit) if length else self.limit
        buff = self.stream.readline(length)
        self.limit -= len(buff)
        return buff

    def close(self):
        self.stream.close()


#=================================================================
# Local text file with known size -- used for binsearch
#=================================================================
class SeekableTextFileReader(object):
    """
    A very simple file-like object wrapper that knows its total size,
    via getsize()
    Supports seek() operation.
    Assumed to be a text file. Used for binsearch.
    """
    def __init__(self, filename):
        self.fh = open(filename, 'rb')
        self.filename = filename
        self.size = os.path.getsize(filename)

    def getsize(self):
        return self.size

    def read(self):
        return self.fh.read()

    def readline(self):
        return self.fh.readline()

    def seek(self, offset):
        return self.fh.seek(offset)

    def close(self):
        return self.fh.close()
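A usage sketch for the two loader flavors (the local path is illustrative; FileLoader wraps the handle in a LimitReader only when a positive length is given):

    from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker

    # read 100 bytes starting at offset 0 from a local file
    reader = FileLoader().load('sample_archive/cdx/iana.cdx', 0, 100)
    print len(reader.read())   # at most 100

    # the http loader issues a Range request, optionally with a signed cookie
    http = HttpLoader(cookie_maker=HMACCookieMaker('secret-key', 'cookie-name',
                                                   duration=30))
    #remote = http.load('http://example.com/path/file.warc.gz', 120, 512)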
pywb/utils/statusandheaders.py (new file, 107 lines)
@@ -0,0 +1,107 @@
"""
Representation and parsing of HTTP-style status + headers
"""

import pprint


#=================================================================
class StatusAndHeaders(object):
    """
    Representation of parsed http-style status line and headers
    Status line is the first line of a request/response
    Headers is a list of (name, value) tuples
    An optional protocol which appears on first line may be specified
    """
    def __init__(self, statusline, headers, protocol=''):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol

    def get_header(self, name):
        """
        return header (name, value)
        if found
        """
        name_lower = name.lower()
        for value in self.headers:
            if value[0].lower() == name_lower:
                return value[1]

    def remove_header(self, name):
        """
        remove header (case-insensitive)
        return True if header removed, False otherwise
        """
        name_lower = name.lower()
        for index in xrange(len(self.headers) - 1, -1, -1):
            if self.headers[index][0].lower() == name_lower:
                del self.headers[index]
                return True

        return False

    def __repr__(self):
        headers_str = pprint.pformat(self.headers, indent=2)
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)

    def __eq__(self, other):
        return (self.statusline == other.statusline and
                self.headers == other.headers and
                self.protocol == other.protocol)


#=================================================================
class StatusAndHeadersParser(object):
    """
    Parser which consumes a stream supporting readline() to read
    status and headers and return a StatusAndHeaders object
    """
    def __init__(self, statuslist):
        self.statuslist = statuslist

    def parse(self, stream):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object
        """
        statusline = stream.readline().rstrip()

        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
            msg = 'Expected Status Line - Found: ' + statusline
            raise StatusAndHeadersParserException(msg)

        headers = []

        line = stream.readline().rstrip()
        while line and line != '\r\n':
            name, value = line.split(':', 1)
            header = (name, value.strip())
            headers.append(header)
            line = stream.readline().rstrip()

        return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                headers=headers,
                                protocol=protocol_status[0])

    @staticmethod
    def split_prefix(key, prefixs):
        """
        split key string into prefix and remainder
        for first matching prefix from a list
        """
        for prefix in prefixs:
            if key.startswith(prefix):
                plen = len(prefix)
                return (key[:plen], key[plen:])


#=================================================================
class StatusAndHeadersParserException(Exception):
    """
    status + headers parsing exception
    """
    pass
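A short sketch of the parser on an in-memory http response, following the class definitions above:

    import StringIO
    from pywb.utils.statusandheaders import StatusAndHeadersParser

    raw = ('HTTP/1.0 200 OK\r\n'
           'Content-Type: text/html\r\n'
           'Content-Length: 5\r\n\r\nhello')

    parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
    status_headers = parser.parse(StringIO.StringIO(raw))

    print status_headers.statusline                    # '200 OK'
    print status_headers.get_header('Content-Type')    # 'text/html'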
pywb/utils/test/binsearch_test.py (new file, 52 lines)
@@ -0,0 +1,52 @@
#=================================================================
"""
# binsearch tests

# Prefix Search
>>> print_binsearch_results('org,iana)/domains/root', iter_prefix)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

>>> print_binsearch_results('org,iana)/domains/root', iter_exact)
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

>>> print_binsearch_results('org,iana)/', iter_exact)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

>>> print_binsearch_results('org,iana)/domains/root/db', iter_exact)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

# Exact Search
>>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact)

>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""


#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.loaders import SeekableTextFileReader

from pywb import get_test_dir

#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'

def print_binsearch_results(key, iter_func):
    cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')

    for line in iter_func(cdx, key):
        print line


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/utils/test/loaders_test.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#=================================================================
"""
# LimitReader Tests
>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'

>>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'

>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'

# FileLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100

# SeekableTextFileReader Test
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
>>> sr.getsize()
30399

>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'

#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'

#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'

>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
"""


#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader

from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'


def read_multiple(reader, inc_reads):
    result = None
    for x in inc_reads:
        result = reader.read(x)
    return result


def seek_read_full(seekable_reader, offset):
    seekable_reader.seek(offset)
    seekable_reader.readline()  #skip
    return seekable_reader.readline()


if __name__ == "__main__":
    import doctest
    doctest.testmod()
pywb/utils/timeutils.py
@@ -1,20 +1,25 @@
+"""
+utility functions for converting between
+datetime, iso date and 14-digit timestamp
+"""
+
 import re
 import time
 import datetime
 import calendar
+from itertools import imap
+
 #=================================================================
 # str <-> datetime conversion
 #=================================================================
+
-DATE_TIMESPLIT = re.compile('[^\d]')
+DATE_TIMESPLIT = re.compile(r'[^\d]')
+
 TIMESTAMP_14 = '%Y%m%d%H%M%S'
+
 PAD_STAMP_END = '29991231235959'
+
+
 def iso_date_to_datetime(string):
     """
     >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
@@ -28,16 +33,18 @@ def iso_date_to_datetime(string):
     if nums[-1] == '':
         nums = nums[:-1]

-    dt = datetime.datetime(*map(int, nums))
-    return dt
+    the_datetime = datetime.datetime(*imap(int, nums))
+    return the_datetime

-def datetime_to_timestamp(dt):
+
+def datetime_to_timestamp(the_datetime):
     """
     >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
     '20131226101112'
     """

-    return dt.strftime(TIMESTAMP_14)
+    return the_datetime.strftime(TIMESTAMP_14)


 def iso_date_to_timestamp(string):
     """
@@ -52,7 +59,7 @@ def iso_date_to_timestamp(string):


 # default pad is end of range for compatibility
-def pad_timestamp(string, pad_str = PAD_STAMP_END):
+def pad_timestamp(string, pad_str=PAD_STAMP_END):
     """
     >>> pad_timestamp('20')
     '20991231235959'
@@ -76,10 +83,12 @@ def pad_timestamp(string, pad_str = PAD_STAMP_END):
 def timestamp_to_datetime(string):
     """
     >>> timestamp_to_datetime('20131226095010')
-    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)
+    time.struct_time(tm_year=2013, tm_mon=12, tm_mday=26, \
+tm_hour=9, tm_min=50, tm_sec=10, tm_wday=3, tm_yday=360, tm_isdst=-1)

     >>> timestamp_to_datetime('2014')
-    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
+    time.struct_time(tm_year=2014, tm_mon=12, tm_mday=31, \
+tm_hour=23, tm_min=59, tm_sec=59, tm_wday=2, tm_yday=365, tm_isdst=-1)
     """

     # Default pad to end of range for compatibility
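A sketch of the conversions above, with expected values taken from the doctests and the padding convention (pad to end of range):

    import datetime
    from pywb.utils.timeutils import (iso_date_to_timestamp, pad_timestamp,
                                      datetime_to_timestamp)

    print iso_date_to_timestamp('2013-12-26T10:11:12Z')   # '20131226101112'
    print pad_timestamp('2014')                           # '20141231235959'
    print datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    # '20131226101112'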
@@ -1,4 +1,4 @@
-import cdxserver.timeutils as timeutils
+import pywb.utils.timeutils as timeutils

 import wbrequestresponse
 import wbexceptions
pywb/warc/README.md (new file, 22 lines)
@@ -0,0 +1,22 @@
## PyWb Warc v0.2

[Build Status](https://travis-ci.org/ikreymer/pywb_warc)

This is the WARC/ARC record loading component of the pywb wayback tool suite.


This package provides the following facilities:

* Resolve relative WARC/ARC filenames to a full path based on configurable resolvers

* Resolve 'revisit' records from provided index to find a full record with headers and payload content

* Load WARC and ARC records either locally or via http using http 1.1 range requests


### Tests

This package will include a test suite for different WARC and ARC loading formats.

To run: `python run-tests.py`
pywb/warc/__init__.py (new file, empty)
pywb/warc/pathresolvers.py
@@ -1,13 +1,27 @@
 import redis
-import binsearch.binsearch
+
+from pywb.utils.binsearch import iter_exact
+from pywb.utils.loaders import SeekableTextFileReader
+
 import urlparse
 import os
 import logging

-#======================================
-# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
-#======================================
+"""
+The purpose of this module is to 'resolve' a warc/arc filename,
+often found in a CDX file, to a full loadable url.
+
+Supported resolvers are: url prefix, path index lookup and redis
+
+make_best_resolver() attempts to guess the resolver method for given uri
+
+"""
+
+
+#=================================================================
+# PrefixResolver - convert cdx file entry to url with prefix
+# if url contains specified string
+#=================================================================
 class PrefixResolver:
     def __init__(self, prefix, contains):
         self.prefix = prefix
@@ -18,14 +32,15 @@ class PrefixResolver:

     def __repr__(self):
         if self.contains:
-            return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains)
+            return ("PrefixResolver('{0}', contains = '{1}')"
+                    .format(self.prefix, self.contains))
         else:
             return "PrefixResolver('{0}')".format(self.prefix)


-#======================================
+#=================================================================
 class RedisResolver:
-    def __init__(self, redis_url, key_prefix = None):
+    def __init__(self, redis_url, key_prefix=None):
         self.redis_url = redis_url
         self.key_prefix = key_prefix if key_prefix else 'w:'
         self.redis = redis.StrictRedis.from_url(redis_url)
@@ -42,14 +57,14 @@ class RedisResolver:
         return "RedisResolver('{0}')".format(self.redis_url)


-#======================================
+#=================================================================
 class PathIndexResolver:
     def __init__(self, pathindex_file):
         self.pathindex_file = pathindex_file
-        self.reader = binsearch.binsearch.FileReader(pathindex_file)
+        self.reader = SeekableTextFileReader(pathindex_file)

     def __call__(self, filename):
-        result = binsearch.binsearch.iter_exact(self.reader, filename, '\t')
+        result = iter_exact(self.reader, filename, '\t')

         def gen_list(result):
             for pathline in result:
@@ -63,6 +78,7 @@ class PathIndexResolver:
         return "PathIndexResolver('{0}')".format(self.pathindex_file)


+#=================================================================
 #TODO: more options (remote files, contains param, etc..)
 # find best resolver given the path
 def make_best_resolver(param):
@@ -80,11 +96,14 @@ def make_best_resolver(param):
     RedisResolver('redis://myhost.example.com:1234/1')

     # a file
-    >>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
+    >>> r = make_best_resolver('file://' + os.path.realpath(__file__))
+    >>> r.__class__.__name__
     'PathIndexResolver'

     # a dir
-    >>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
+    >>> path = os.path.realpath(__file__)
+    >>> r = make_best_resolver('file://' + os.path.dirname(path))
+    >>> r.__class__.__name__
     'PrefixResolver'

     """
@@ -99,27 +118,29 @@ def make_best_resolver(param):
     url_parts = urlparse.urlsplit(path)

     if url_parts.scheme == 'redis':
-        logging.info('Adding Redis Index: ' + path)
+        logging.debug('Adding Redis Index: ' + path)
         return RedisResolver(path, arg)

     if url_parts.scheme == 'file':
         path = url_parts.path

         if os.path.isfile(path):
-            logging.info('Adding Path Index: ' + path)
+            logging.debug('Adding Path Index: ' + path)
             return PathIndexResolver(path)

         # non-file paths always treated as prefix for now
         else:
-            logging.info('Adding Archive Path Source: ' + path)
+            logging.debug('Adding Archive Path Source: ' + path)
             return PrefixResolver(path, arg)


 #=================================================================
 def make_best_resolvers(paths):
     """
-    >>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
-    [PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
+    >>> r = make_best_resolvers(['http://example.com/warcs/',\
+                                 'redis://example.com:1234/1'])
+    >>> map(lambda x: x.__class__.__name__, r)
+    ['PrefixResolver', 'RedisResolver']
     """
     if hasattr(paths, '__iter__'):
         return map(make_best_resolver, paths)
@@ -127,13 +148,7 @@ def make_best_resolvers(paths):
     return [make_best_resolver(paths)]


-import utils
 #=================================================================
-if __name__ == "__main__" or utils.enable_doctests():
+if __name__ == "__main__":

-    def class_name(obj):
-        return obj.__class__.__name__
-
     import doctest
     doctest.testmod()
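A sketch of resolver selection, mirroring the doctests above (the module name pywb.warc.pathresolvers is assumed from the import in resolvingloader.py below):

    from pywb.warc.pathresolvers import make_best_resolvers

    # guess a resolver type per path: http prefix, redis url, or path-index file
    resolvers = make_best_resolvers(['http://example.com/warcs/',
                                     'redis://example.com:1234/1'])

    # each resolver maps a bare warc filename to candidate full paths
    for resolver in resolvers:
        print resolver.__class__.__name__   # PrefixResolver, RedisResolver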
pywb/warc/recordloader.py (new file, 161 lines)
@@ -0,0 +1,161 @@
import itertools
import urlparse
import collections

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser

from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader

#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
                                       'type, rec_headers, ' +
                                       'stream, status_headers')


#=================================================================
class ArchiveLoadFailed(Exception):
    def __init__(self, reason, filename=''):
        super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
        #self.filename = filename
        #self.reason = reason

    def status(self):
        return '503 Service Unavailable'


#=================================================================
class ArcWarcRecordLoader:
    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date",
                   "content-type", "length"]

    # Since loading a range request,
    # can only determine gzip-ness based on file extension
    # (BufferedReader will however default to non-gzip if
    # decompression fails)
    FORMAT_MAP = {
        '.warc.gz': ('warc', True),
        '.arc.gz': ('arc', True),
        '.warc': ('warc', False),
        '.arc': ('arc', False),
    }

    @staticmethod
    def create_default_loaders(cookie_maker=None):
        http = HttpLoader(cookie_maker)
        file = FileLoader()
        return {
            'http': http,
            'https': http,
            'file': file,
            '': file
        }

    def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
        self.loaders = loaders

        if not self.loaders:
            self.loaders = self.create_default_loaders(cookie_maker)

        self.chunk_size = chunk_size

        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

        warc_types = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
        self.warc_parser = StatusAndHeadersParser(warc_types)
        self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

    def load(self, url, offset, length):
        url_parts = urlparse.urlsplit(url)

        loader = self.loaders.get(url_parts.scheme)
        if not loader:
            raise ArchiveLoadFailed('Unknown Protocol', url)

        the_format = None

        for ext, iformat in self.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                the_format = iformat
                break

        if the_format is None:
            raise ArchiveLoadFailed('Unknown file format', url)

        (a_format, is_gzip) = the_format

        #decomp = utils.create_decompressor() if is_gzip else None
        decomp_type = 'gzip' if is_gzip else None

        try:
            length = int(length)
        except:
            length = -1

        raw = loader.load(url, long(offset), length)

        stream = BufferedReader(raw, length, self.chunk_size, decomp_type)

        if a_format == 'arc':
            rec_headers = self.arc_parser.parse(stream)
            rec_type = 'response'
            empty = (rec_headers.get_header('length') == 0)

        elif a_format == 'warc':
            rec_headers = self.warc_parser.parse(stream)
            rec_type = rec_headers.get_header('WARC-Type')
            empty = (rec_headers.get_header('Content-Length') == '0')

        # special case: empty w/arc record (hopefully a revisit)
        if empty:
            status_headers = StatusAndHeaders('204 No Content', [])

        # special case: warc records that are not expected to have http headers
        # attempt to add 200 status and content-type
        elif rec_type == 'metadata' or rec_type == 'resource':
            content_type = [('Content-Type',
                             rec_headers.get_header('Content-Type'))]

            status_headers = StatusAndHeaders('200 OK', content_type)

        # special case: http 0.9 response, no status or headers
        #elif rec_type == 'response':
        #    content_type = rec_headers.get_header('Content-Type')
        #    if content_type and (';version=0.9' in content_type):
        #        status_headers = StatusAndHeaders('200 OK', [])

        # response record: parse HTTP status and headers!
        else:
            #(statusline, http_headers) = self.parse_http_headers(stream)
            status_headers = self.http_parser.parse(stream)

        return ArcWarcRecord((a_format, rec_type),
                             rec_headers, stream, status_headers)


#=================================================================
class ARCHeadersParser:
    def __init__(self, headernames):
        self.headernames = headernames

    def parse(self, stream):
        headerline = stream.readline().rstrip()

        parts = headerline.split()

        headernames = self.headernames

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            raise ArchiveLoadFailed(msg.format(headernames, parts))

        headers = []

        for name, value in itertools.izip(headernames, parts):
            headers.append((name, value))

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='ARC/1.0')
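A sketch of loading one record from a cdx entry; the warc path is illustrative, and the offset/length values are taken from the sample cdx lines quoted in binsearch_test.py above (cdx fields S=length, V=offset assumed):

    from pywb.warc.recordloader import ArcWarcRecordLoader

    loader = ArcWarcRecordLoader()
    record = loader.load('sample_archive/warcs/iana.warc.gz',
                         '657746', '2691')

    print record.type                       # ('warc', 'response')
    print record.status_headers.statusline  # http status of the capture
    payload = record.stream.read()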
pywb/warc/resolvingloader.py (new file, 176 lines)
@@ -0,0 +1,176 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pathresolvers import make_best_resolvers


#=================================================================
class ResolvingLoader:
    def __init__(self, paths, record_loader=ArcWarcRecordLoader(),
                 cdx_server=None):

        self.path_resolvers = make_best_resolvers(paths)
        self.record_loader = record_loader
        self.cdx_server = cdx_server

    def resolve_headers_and_payload(self, cdx, failed_files):
        """
        Resolve headers and payload for a given capture
        In the simple case, headers and payload are in the same record.
        In the case of revisit records, the payload and headers may be in
        different records.

        If the original has already been found, lookup original using
        orig. fields in cdx dict.
        Otherwise, call _load_different_url_payload() to get cdx index
        from a different url to find the original record.
        """
        has_curr = (cdx['filename'] != '-')
        has_orig = (cdx.get('orig.filename', '-') != '-')

        # load headers record from cdx['filename'] unless it is '-' (rare)
        headers_record = None
        if has_curr:
            headers_record = self._resolve_path_load(cdx, False, failed_files)

        # two index lookups
        # Case 1: if mimetype is still warc/revisit
        if cdx['mimetype'] == 'warc/revisit' and headers_record:
            payload_record = self._load_different_url_payload(cdx,
                                                              headers_record,
                                                              failed_files)

        # single lookup cases
        # case 2: non-revisit
        elif (has_curr and not has_orig):
            payload_record = headers_record

        # case 3: identical url revisit, load payload from orig.filename
        elif (has_orig):
            payload_record = self._resolve_path_load(cdx, True, failed_files)

        # special case: set header to payload if old-style revisit
        # with missing header
        if not headers_record:
            headers_record = payload_record
        elif headers_record != payload_record:
            # close remainder of stream as this record only used for
            # (already parsed) headers
            headers_record.stream.close()

            # special case: check if headers record is actually empty
            # (eg empty revisit), then use headers from revisit
            if not headers_record.status_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        return (headers_record.status_headers, payload_record.stream)

    def _resolve_path_load(self, cdx, is_original, failed_files):
        """
        Load specific record based on filename, offset and length
        fields in the cdx.
        If original=True, use the orig.* fields for the cdx

        Resolve the filename to full path using specified path resolvers

        If failed_files list provided, keep track of failed resolve attempts
        """

        if is_original:
            (filename, offset, length) = (cdx['orig.filename'],
                                          cdx['orig.offset'],
                                          cdx['orig.length'])
        else:
            (filename, offset, length) = (cdx['filename'],
                                          cdx['offset'],
                                          cdx['length'])

        # optimization: if same file already failed this request,
        # don't try again
        if failed_files and filename in failed_files:
            raise ArchiveLoadFailed('Skipping Already Failed', filename)

        any_found = False
        last_exc = None
        for resolver in self.path_resolvers:
            possible_paths = resolver(filename)

            if possible_paths:
                for path in possible_paths:
|
||||||
|
any_found = True
|
||||||
|
try:
|
||||||
|
return self.record_loader.load(path, offset, length)
|
||||||
|
|
||||||
|
except Exception as ue:
|
||||||
|
last_exc = ue
|
||||||
|
|
||||||
|
# Unsuccessful if reached here
|
||||||
|
if failed_files:
|
||||||
|
failed_files.append(filename)
|
||||||
|
|
||||||
|
if last_exc:
|
||||||
|
msg = str(last_exc.__class__.__name__)
|
||||||
|
else:
|
||||||
|
msg = 'Archive File Not Found'
|
||||||
|
|
||||||
|
raise ArchiveLoadFailed(msg, filename)
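
    # Each resolver returned by make_best_resolvers() is a callable mapping
    # a (W)ARC filename to a list of candidate full paths, tried in order.
    # A minimal sketch of such a resolver (hypothetical, for illustration):
    #
    #     def prefix_resolver(prefix):
    #         def resolve(filename):
    #             return [prefix + filename]
    #         return resolve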

    def _load_different_url_payload(self, cdx, headers_record, failed_files):
        """
        Handle the case where a duplicate of a capture with the same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for a matching
        url, timestamp and digest.

        Raise an exception if no matches are found.
        """

        ref_target_uri = (headers_record.rec_headers.
                          get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # Check for unresolved revisit error,
        # if the refers-to target uri is not present or same as the current url
        if not ref_target_uri or (ref_target_uri == target_uri):
            raise ArchiveLoadFailed('Missing Revisit Original')

        ref_target_date = (headers_record.rec_headers.
                           get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                cdx['digest'])

        for cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(cdx, False,
                                                         failed_files)
                return payload_record

            except ArchiveLoadFailed as e:
                pass

        raise ArchiveLoadFailed('Original for revisit could not be loaded')

    def load_cdx_for_dupe(self, url, timestamp, digest):
        """
        If a cdx_server is available, return a response from the server,
        otherwise an empty list.
        """
        if not self.cdx_server:
            return []

        params = {'url': url,
                  'closest': timestamp,
                  'filter': 'digest:' + digest,
                  'output': 'raw'}

        return self.cdx_server.load_cdx(params)
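
# A usage sketch mirroring load_from_cdx_test() in the tests below
# (the warc directory and cdx line are taken from the shared sample_archive):
#
#     loader = ResolvingLoader('sample_archive/warcs/')
#     cdx = CDXObject('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
#     (status_headers, stream) = loader.resolve_headers_and_payload(cdx, None)
#     print status_headers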
199
pywb/warc/test/test_loading.py
Normal file
@ -0,0 +1,199 @@

"""
Test loading different types of records from a variety of formats

# Load response record from WARC
>>> load_test_archive('example.warc.gz', '333', '1043')
(('warc', 'response'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
('WARC-Date', '2014-01-03T03:03:21Z'),
('Content-Length', '1610'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))

# Load revisit record from WARC
>>> load_test_archive('example.warc.gz', '1864', '553')
(('warc', 'revisit'),
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
('WARC-Date', '2014-01-03T03:03:41Z'),
('Content-Length', '340'),
('Content-Type', 'application/http; msgtype=response'),
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
('WARC-Target-URI', 'http://example.com?example=1'),
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
( 'WARC-Profile',
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')]))


# Test of record loading based on cdx line
# Print parsed http headers + 2 lines of content
# ==============================================================================

# Test loading from ARC based on cdx line
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>

>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>


# Test loading from WARC based on cdx line
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>

# Test cdx w/ revisit
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 333 example.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
('Etag', '"359670651"'),
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FCE)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270'),
('Connection', 'close')])
<!doctype html>
<html>

# Test loading warc created by wget 1.14
>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz')
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
('Cache-Control', 'max-age=604800'),
('Content-Type', 'text/html'),
('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'),
('Etag', '"359670651"'),
('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'),
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
('Server', 'ECS (sjc/4FB4)'),
('X-Cache', 'HIT'),
('x-ec-custom-error', '1'),
('Content-Length', '1270')])
<!doctype html>
<html>

# Error Handling

# Invalid WARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException

# Invalid ARC Offset
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 332 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException


# Error Expected with revisit -- invalid offset on original
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
Traceback (most recent call last):
ArchiveLoadFailed: example.warc.gz:StatusAndHeadersParserException

"""

import os
import sys
import pprint

from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.pathresolvers import make_best_resolvers
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.cdx.cdxobject import CDXObject

from pywb import get_test_dir

#test_warc_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_warc_dir = get_test_dir() + 'warcs/'

def load_test_archive(test_file, offset, length):
    path = test_warc_dir + test_file

    testloader = ArcWarcRecordLoader()

    archive = testloader.load(path, offset, length)

    pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))


def load_from_cdx_test(cdx):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)
    (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None)
    print headers
    sys.stdout.write(stream.readline())
    sys.stdout.write(stream.readline())
@ -1,8 +1,7 @@
-import utils
 import wbexceptions

 from wbrequestresponse import WbResponse, StatusAndHeaders
-from cdxserver.cdxserver import CDXException
+from pywb.cdx.cdxserver import CDXException

 import os
 import importlib
@ -10,13 +9,37 @@ import logging


+#=================================================================
+# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
+# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
+# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
+def rel_request_uri(environ, include_query=1):
+    """
+    Return the requested path, optionally including the query string
+
+    # Simple test:
+    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
+    '/web/example.com'
+
+    # Test all unencoded special chars and double-quote
+    # (double-quote must be encoded but not single quote)
+    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
+    "/web/example.com/0~!+$&'()*+,;=:%22"
+    """
+    from urllib import quote
+    url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
+    if include_query and environ.get('QUERY_STRING'):
+        url += '?' + environ['QUERY_STRING']
+
+    return url
+
 #=================================================================
 def create_wb_app(wb_router):
     # Top-level wsgi application
     def application(env, start_response):
         if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
-            env['REL_REQUEST_URI'] = utils.rel_request_uri(env)
+            env['REL_REQUEST_URI'] = rel_request_uri(env)
         else:
             env['REL_REQUEST_URI'] = env['REQUEST_URI']

@ -95,7 +118,7 @@ def main():
         raise

 #=================================================================
-if __name__ == "__main__" or utils.enable_doctests():
+if __name__ == "__main__":
     pass
 else:
     application = main()
@ -1,7 +1,6 @@
-from wburl import WbUrl
+from pywb.rewrite.wburl import WbUrl
-from url_rewriter import UrlRewriter
+from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.utils.statusandheaders import StatusAndHeaders
-import utils

 import pprint
 #WB Request and Response
@ -182,35 +181,6 @@ class WbResponse:
     def __repr__(self):
         return str(vars(self))

-
-#=================================================================
-class StatusAndHeaders:
-    def __init__(self, statusline, headers, protocol = ''):
-        self.statusline = statusline
-        self.headers = headers
-        self.protocol = protocol
-
-    def get_header(self, name):
-        name_lower = name.lower()
-        for value in self.headers:
-            if (value[0].lower() == name_lower):
-                return value[1]
-
-    def remove_header(self, name):
-        name_lower = name.lower()
-        for x in xrange(len(self.headers) - 1, -1, -1):
-            if self.headers[x][0].lower() == name_lower:
-                del self.headers[x]
-                break
-
-    def __repr__(self):
-        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, pprint.pformat(self.headers, indent = 2))
-        #return pprint.pformat(self.__dict__)
-
-    def __eq__(self, other):
-        return self.statusline == other.statusline and self.headers == other.headers and self.protocol == other.protocol
-
-
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
3
run-tests.py
Normal file
@ -0,0 +1,3 @@
import pytest
result = pytest.main('-v --doctest-module tests/ pywb/')
exit(result)
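
The equivalent command-line invocation (assuming pytest is installed, as setup.py below now requires) would be:

py.test -v --doctest-module tests/ pywb/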
BIN
sample_archive/cdx/iana.cdx.gz
Normal file
Binary file not shown.
BIN
sample_archive/warcs/example-wget-1-14.warc.gz
Normal file
Binary file not shown.
69
sample_archive/warcs/example.arc
Normal file
@ -0,0 +1,69 @@
filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
1 0 LiveWeb Capture
URL IP-address Archive-date Content-type Archive-length

http://example.com/ 93.184.216.119 20140216050221 text/html 1591
HTTP/1.1 200 OK
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html
Date: Sun, 16 Feb 2014 05:02:20 GMT
Etag: "359670651"
Expires: Sun, 23 Feb 2014 05:02:20 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (sjc/4FCE)
X-Cache: HIT
x-ec-custom-error: 1
Content-Length: 1270

<!doctype html>
<html>
<head>
<title>Example Domain</title>

<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>

<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>

BIN
sample_archive/warcs/example.arc.gz
Normal file
Binary file not shown.
10
setup.py
@ -5,18 +5,18 @@ import setuptools
 import glob

 setuptools.setup(name='pywb',
-                 version='0.1',
+                 version='0.2',
                  url='https://github.com/ikreymer/pywb',
                  author='Ilya Kreymer',
                  author_email='ilya@archive.org',
                  long_description=open('README.md').read(),
                  license='GPL',
-                 packages=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
+                 packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-                 provides=['pywb', 'pywb.binsearch', 'pywb.cdxserver'],
+                 provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
                  package_data={'pywb': ['ui/*', 'static/*']},
                  data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                                ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
-                 install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest'],
+                 install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
-                 tests_require=['WebTest', 'pytest'],
+                 # tests_require=['WebTest', 'pytest'],
                  zip_safe=False)
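
With pytest moved into install_requires, a fresh checkout can be exercised end-to-end with, for example:

python setup.py install
python run-tests.py

The data_files entries above copy the sample_archive cdx and warc files that the test suites in this commit load.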
88
tests/test_archivalrouter.py
Normal file
@ -0,0 +1,88 @@
"""
Test Route
# route with relative path
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}

# route with absolute path, running at script /my_pywb
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}


# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)


# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']

>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']

>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'

>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

>>> _test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# Custom collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/complex/123/20131010/http://example.com/path/page.html', coll='complex/123')
'http://localhost:8080/complex/123/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'

# With timestamp included
>>> _test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

# Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False

# Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
'http://example.com:8080/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME + timestamp
>>> _test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'

# With custom SCRIPT_NAME, bad match
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False


"""

from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler


def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
    env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

    if http_host:
        env['HTTP_HOST'] = http_host

    routes = [Route(coll, BaseHandler())]

    redir = ReferRedirect(match_host)
    #req = WbRequest.from_uri(request_uri, env)
    rep = redir(env, routes)
    if not rep:
        return False

    return rep.status_headers.get_header('Location')

@ -1,43 +0,0 @@
import os
from ..pywb.binsearch.binsearch import iter_prefix, iter_exact, FileReader

test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'

def binsearch_cdx_test(key, iter_func):
    """
    # Prefix Search
    >>> binsearch_cdx_test('org,iana)/domains/root', iter_prefix)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root', iter_exact)
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/', iter_exact)
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz

    >>> binsearch_cdx_test('org,iana)/domains/root/db', iter_exact)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz

    # Exact Search
    >>> binsearch_cdx_test('org,iaana)/', iter_exact)
    >>> binsearch_cdx_test('org,ibna)/', iter_exact)

    >>> binsearch_cdx_test('org,iana)/time-zones', iter_exact)
    org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
    """

    cdx = FileReader(test_cdx_dir + 'iana.cdx')

    for line in iter_func(cdx, key):
        print line


if __name__ == "__main__":
    import doctest
    doctest.testmod()

@ -1,149 +0,0 @@
from ..pywb.binsearch.binsearch import iter_exact, iter_prefix, FileReader
from ..pywb.cdxserver.cdxserver import CDXServer
import os
import sys
import pprint

test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/cdx/'

def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    """
    # Merge Sort Multiple CDX Sources
    >>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
    org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
    org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz


    # Limit CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz


    # Reverse CDX Stream
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
    org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz

    # No matching results
    >>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)


    # Filter cdx
    >>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
    org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
    org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
    org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
    org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
    org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
    org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
    org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz


    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz


    # Collapse by timestamp
    # unresolved revisits, different statuscode results in an extra repeat
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz

    # resolved revisits
    >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
    org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
    org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


    # Sort by closest timestamp + field select output
    >>> cdx_ops_test(closest_to = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
    20140126200826
    20140126200816
    20140126200805
    20140126200912
    20140126200738
    20140126200930
    20140126200718
    20140126200706
    20140126200654
    20140126200625

    >>> cdx_ops_test(closest_to = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -

    >>> cdx_ops_test(closest_to = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
    org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
    org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -

    # equal dist prefer earlier
    >>> cdx_ops_test(closest_to = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
    org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz

    >>> cdx_ops_test(closest_to = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200654
    20140126200706

    >>> cdx_ops_test(closest_to = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
    20140126200706
    20140126200654


    # Resolve Revisits
    >>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
    org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz

    >>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
    org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
    org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -


    # CDX Server init
    >>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 1, output = 'raw')
    >>> pprint.pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20140127171200'),
     ('original', 'http://example.com'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
     ('redirect', '-'),
     ('robotflags', '-'),
     ('length', '1046'),
     ('offset', '334'),
     ('filename', 'dupes.warc.gz')]
    """

    kwparams['url'] = url
    kwparams['output'] = 'text'

    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)

    for x in results:
        sys.stdout.write(x)


if __name__ == "__main__":
    import doctest
    doctest.testmod()

@ -1,7 +1,7 @@
 import webtest
-from ..pywb.pywb_init import pywb_config
+from pywb.pywb_init import pywb_config
-from ..pywb.wbapp import create_wb_app
+from pywb.wbapp import create_wb_app
-from ..pywb.cdxserver.cdxobject import CDXObject
+from pywb.cdx.cdxobject import CDXObject

 class TestWb:
     TEST_CONFIG = 'test_config.yaml'