1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

spin-off warcio!

update imports to point to warcio
warcio rename fixes:
- ArcWarcRecord.stream -> raw_stream
- ArcWarcRecord.status_headers -> http_headers
- ArchiveLoadFailed single param init
This commit is contained in:
Ilya Kreymer 2017-03-01 18:37:38 -08:00
parent 4a94699a65
commit 0784e4e5aa
59 changed files with 186 additions and 2538 deletions

View File

@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP from warcio.timeutils import timestamp_to_sec, pad_timestamp
from warcio.timeutils import PAD_14_DOWN, PAD_14_UP
import bisect import bisect

View File

@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
from fakeredis import FakeStrictRedis from fakeredis import FakeStrictRedis
from mock import patch from mock import patch
from pywb.utils.timeutils import timestamp_to_sec from warcio.timeutils import timestamp_to_sec
from pywb.cdx.cdxsource import RedisCDXSource from pywb.cdx.cdxsource import RedisCDXSource
from pywb.cdx.cdxserver import CDXServer from pywb.cdx.cdxserver import CDXServer

View File

@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException from pywb.cdx.cdxobject import IDXObject, CDXException
from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@ -1,6 +1,6 @@
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp from warcio.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date from warcio.timeutils import timestamp_to_http_date
from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl

View File

@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter from pywb.framework.archivalrouter import ArchivalRouter
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
from six import iteritems
import base64 import base64
import socket import socket
@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader from warcio.bufferedreaders import BufferedReader
from pywb.utils.loaders import to_native_str from warcio.utils import to_native_str
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
@ -250,7 +251,8 @@ class ProxyRouter(object):
# add extra headers for replay responses # add extra headers for replay responses
if wbrequest.wb_url and wbrequest.wb_url.is_replay(): if wbrequest.wb_url and wbrequest.wb_url.is_replay():
response.status_headers.replace_headers(self.extra_headers) for name, value in iteritems(self.extra_headers):
response.status_headers.replace_header(name, value)
# check for content-length # check for content-length
res = response.status_headers.get_header('content-length') res = response.status_headers.get_header('content-length')

View File

@ -1,7 +1,6 @@
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.framework.cache import create_cache from pywb.framework.cache import create_cache
@ -10,7 +9,8 @@ from pywb.framework.basehandlers import WbUrlHandler
from six.moves.urllib.parse import parse_qs, urlsplit from six.moves.urllib.parse import parse_qs, urlsplit
import six import six
from pywb.utils.loaders import to_native_str from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
import base64 import base64
import os import os

View File

@ -61,7 +61,7 @@
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.framework.wbrequestresponse import WbRequest, WbResponse

View File

@ -1,4 +1,4 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO from io import BytesIO

View File

@ -1,7 +1,10 @@
from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config, to_native_str from pywb.utils.loaders import load_yaml_config
from pywb.utils.loaders import load_yaml_config
from warcio.utils import to_native_str
from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
import os import os

View File

@ -13,7 +13,7 @@ from pkg_resources import resource_string
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now from warcio.timeutils import timestamp20_now
from pywb import DEFAULT_CONFIG from pywb import DEFAULT_CONFIG

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date
import re import re

View File

@ -7,12 +7,11 @@ import traceback
import portalocker import portalocker
from pywb.utils.timeutils import timestamp20_now from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.webagg.utils import res_template from pywb.webagg.utils import res_template
from pywb.warc.warcwriter import BaseWARCWriter
# ============================================================================ # ============================================================================
class MultiFileWARCWriter(BaseWARCWriter): class MultiFileWARCWriter(BaseWARCWriter):
@ -22,6 +21,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
max_idle_secs=1800, *args, **kwargs): max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs) super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
self.header_filter = kwargs.get('header_filter')
if not filename_template: if not filename_template:
dir_template, filename_template = os.path.split(dir_template) dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep dir_template += os.path.sep
@ -41,27 +42,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
self.fh_cache = {} self.fh_cache = {}
def write_req_resp(self, req, resp, params):
url = resp.rec_headers.get_header('WARC-Target-URI')
dt = resp.rec_headers.get_header('WARC-Date')
#req.rec_headers['Content-Type'] = req.content_type
req.rec_headers.replace_header('WARC-Target-URI', url)
req.rec_headers.replace_header('WARC-Date', dt)
resp_id = resp.rec_headers.get_header('WARC-Record-ID')
if resp_id:
req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
self._do_write_req_resp(req, resp, params)
def _check_revisit(self, record, params): def _check_revisit(self, record, params):
if not self.dedup_index: if not self.dedup_index or record.rec_type != 'response':
return record return record
try: try:
@ -77,14 +59,18 @@ class MultiFileWARCWriter(BaseWARCWriter):
return None return None
if isinstance(result, tuple) and result[0] == 'revisit': if isinstance(result, tuple) and result[0] == 'revisit':
record.rec_headers.replace_header('WARC-Type', 'revisit') record = self.create_revisit_record(url, digest, result[1], result[2],
record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) http_headers=record.http_headers)
record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1])
record.rec_headers.add_header('WARC-Refers-To-Date', result[2])
return record return record
def _set_header_buff(self, record):
exclude_list = None
if self.header_filter:
exclude_list = self.header_filter(record)
buff = record.http_headers.to_bytes(exclude_list)
record.http_headers.headers_buff = buff
def get_new_filename(self, dir_, params): def get_new_filename(self, dir_, params):
timestamp = timestamp20_now() timestamp = timestamp20_now()
@ -153,6 +139,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
self._do_write_req_resp(None, record, params) self._do_write_req_resp(None, record, params)
def _do_write_req_resp(self, req, resp, params): def _do_write_req_resp(self, req, resp, params):
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
def write_callback(out, filename): def write_callback(out, filename):
#url = resp.rec_headers.get_header('WARC-Target-URI') #url = resp.rec_headers.get_header('WARC-Target-URI')
#print('Writing req/resp {0} to {1} '.format(url, filename)) #print('Writing req/resp {0} to {1} '.format(url, filename))
@ -180,6 +171,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
close_file = False close_file = False
new_size = start = 0
if result: if result:
out, filename = result out, filename = result
is_new = False is_new = False

View File

@ -69,16 +69,21 @@ class RecorderApp(object):
req_head, req_pay, resp_head, resp_pay, params = result req_head, req_pay, resp_head, resp_pay, params = result
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
resp = self.writer.copy_warc_record(resp_pay) resp_length = resp_pay.tell()
resp_pay.seek(0)
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
if resp.rec_type == 'response': if resp.rec_type == 'response':
uri = resp.rec_headers.get_header('WARC-Target-Uri') uri = resp.rec_headers.get_header('WARC-Target-Uri')
req_length = req_pay.tell()
req_pay.seek(0)
req = self.writer.create_warc_record(uri=uri, req = self.writer.create_warc_record(uri=uri,
record_type='request', record_type='request',
payload=req_pay, payload=req_pay,
length=req_length,
warc_headers_dict=req_head) warc_headers_dict=req_head)
self.writer.write_req_resp(req, resp, params) self.writer.write_request_response_pair(req, resp, params)
else: else:
self.writer.write_record(resp, params) self.writer.write_record(resp, params)
@ -133,9 +138,13 @@ class RecorderApp(object):
content_type = headers.get('Content-Type') content_type = headers.get('Content-Type')
payload_length = req_stream.out.tell()
req_stream.out.seek(0)
record = self.writer.create_warc_record(uri=params['url'], record = self.writer.create_warc_record(uri=params['url'],
record_type=record_type, record_type=record_type,
payload=req_stream.out, payload=req_stream.out,
length=payload_length,
warc_content_type=content_type, warc_content_type=content_type,
warc_headers_dict=req_stream.headers) warc_headers_dict=req_stream.headers)

View File

@ -1,7 +1,8 @@
from pywb.utils.canonicalize import calc_search_range from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import iso_date_to_timestamp
from warcio.timeutils import iso_date_to_timestamp
from io import BytesIO from io import BytesIO
import os import os

View File

@ -20,11 +20,13 @@ from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitD
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.statusandheaders import StatusAndHeadersParser
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.archiveiterator import ArchiveIterator
from pywb.warc.cdxindexer import write_cdx_index from pywb.warc.cdxindexer import write_cdx_index
from pywb.warc.archiveiterator import ArchiveIterator
from six.moves.urllib.parse import quote, unquote, urlencode from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO from io import BytesIO
@ -122,9 +124,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
filename = os.path.join(base_dir, filename) filename = os.path.join(base_dir, filename)
with open(filename, 'rb') as fh: with open(filename, 'rb') as fh:
for record in ArchiveIterator(fh, no_record_parse=True): for record in ArchiveIterator(fh, no_record_parse=True):
assert record.status_headers == None assert record.http_headers == None
assert int(record.rec_headers.get_header('Content-Length')) == record.length assert int(record.rec_headers.get_header('Content-Length')) == record.length
assert record.length == len(record.stream.read()) assert record.length == len(record.raw_stream.read())
def test_record_warc_1(self): def test_record_warc_1(self):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
@ -168,16 +170,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff) record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(base_path) stored_req, stored_resp = self._load_resp_req(base_path)
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers assert ('Cookie', 'boo=far') in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookiecheck/', 1) self._test_all_warcs('/warcs/cookiecheck/', 1)
@ -193,16 +195,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff) record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(warc_path) stored_req, stored_resp = self._load_resp_req(warc_path)
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookieskip/', 1) self._test_all_warcs('/warcs/cookieskip/', 1)
@ -530,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('Content-Length') == str(len(buff)) assert status_headers.get_header('Content-Length') == str(len(buff))
assert status_headers.get_header('WARC-Custom') == 'foo' assert status_headers.get_header('WARC-Custom') == 'foo'
assert record.stream.read() == buff assert record.raw_stream.read() == buff
status_headers = record.status_headers status_headers = record.http_headers
assert len(record.status_headers.headers) == 2 assert len(record.http_headers.headers) == 2
assert status_headers.get_header('Content-Type') == 'text/plain' assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff)) assert status_headers.get_header('Content-Length') == str(len(buff))

View File

@ -1,5 +1,5 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from datetime import datetime, timedelta from datetime import datetime, timedelta
import six import six

View File

@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.rewriterules import RewriteRules from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader from warcio.bufferedreaders import DecompressingBufferedReader
from pywb.utils.loaders import to_native_str from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
from warcio.utils import to_native_str
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter

View File

@ -11,10 +11,11 @@ import os
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
import six import six
from warcio.timeutils import timestamp_now
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent

View File

@ -50,9 +50,9 @@ HTTP Headers Rewriting
from pywb.rewrite.header_rewriter import HeaderRewriter from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from datetime import datetime from datetime import datetime
import pprint import pprint

View File

@ -1,5 +1,5 @@
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from six.moves import zip from six.moves import zip
import redis import redis

View File

@ -6,10 +6,10 @@ from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp from warcio.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from io import BytesIO from io import BytesIO
@ -81,7 +81,7 @@ class PlatformHandler(RewriteHandler):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest) head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter, result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
record.status_headers, record.http_headers,
record.stream, record.stream,
head_insert_func, head_insert_func,
urlkey, urlkey,

View File

@ -6,14 +6,15 @@ from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.bufferedreaders import BufferedReader
from warcio.timeutils import http_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webagg.utils import BUFF_SIZE from pywb.webagg.utils import BUFF_SIZE
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.webagg.utils import MementoUtils, buffer_iter from pywb.webagg.utils import MementoUtils, buffer_iter
@ -200,11 +201,11 @@ class RewriterApp(object):
self._add_custom_params(cdx, r.headers, kwargs) self._add_custom_params(cdx, r.headers, kwargs)
if readd_range: if readd_range:
content_length = (record.status_headers. content_length = (record.http_headers.
get_header('Content-Length')) get_header('Content-Length'))
try: try:
content_length = int(content_length) content_length = int(content_length)
record.status_headers.add_range(0, content_length, record.http_headers.add_range(0, content_length,
content_length) content_length)
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
@ -228,8 +229,8 @@ class RewriterApp(object):
cookie_key) cookie_key)
result = self.content_rewriter.rewrite_content(urlrewriter, result = self.content_rewriter.rewrite_content(urlrewriter,
record.status_headers, record.http_headers,
record.stream, record.raw_stream,
head_insert_func, head_insert_func,
urlkey, urlkey,
cdx, cdx,

View File

@ -1,5 +1,6 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.utils.loaders import load from pywb.utils.loaders import load
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit

View File

@ -1,336 +0,0 @@
from io import BytesIO
import zlib
import brotli
#=================================================================
def gzip_decompressor():
    """Return a zlib decompressor that accepts a gzip-wrapped stream.

    Passing ``zlib.MAX_WBITS | 16`` as wbits tells zlib to expect a gzip
    header and trailer rather than a raw zlib stream.
    """
    return zlib.decompressobj(zlib.MAX_WBITS | 16)
def deflate_decompressor():
    """Return a decompressor for a standard (zlib-wrapped) deflate stream."""
    return zlib.decompressobj()
def deflate_decompressor_alt():
    """Return a decompressor for a raw deflate stream (no zlib header/trailer)."""
    return zlib.decompressobj(-zlib.MAX_WBITS)
def brotli_decompressor():
    """Return a brotli decompressor shimmed to the zlib interface.

    A ``unused_data`` attribute (always None here) is attached so the
    object exposes the same attribute BufferedReader reads on zlib
    decompressors; brotli's Decompressor does not provide it natively.
    """
    # NOTE: 'brotli' is a third-party package imported at the top of this file.
    decomp = brotli.Decompressor()
    decomp.unused_data = None
    return decomp
#=================================================================
class BufferedReader(object):
    """
    A wrapping line reader which wraps an existing reader.
    Read operations operate on underlying buffer, which is filled to
    block_size (1024 default)

    If an optional decompress type is specified,
    data is fed through the decompressor when read from the buffer.
    Currently supported decompression: gzip
    If unspecified, default decompression is None

    If decompression is specified, and decompress fails on first try,
    data is assumed to not be compressed and no exception is thrown.
    If a failure occurs after data has been
    partially decompressed, the exception is propagated.
    """

    # decomp_type name (lower-cased) -> factory returning a fresh
    # decompressor with a zlib-style decompress()/unused_data interface
    DECOMPRESSORS = {'gzip': gzip_decompressor,
                     'deflate': deflate_decompressor,
                     'deflate_alt': deflate_decompressor_alt,
                     'br': brotli_decompressor
                    }

    def __init__(self, stream, block_size=1024,
                 decomp_type=None,
                 starting_data=None):
        # underlying raw stream, read in block_size chunks
        self.stream = stream
        self.block_size = block_size
        self._init_decomp(decomp_type)
        # current (possibly decompressed) buffer and bookkeeping counters
        self.buff = None
        # optional bytes consumed before any read from the stream
        self.starting_data = starting_data
        self.num_read = 0
        self.buff_size = 0

    def set_decomp(self, decomp_type):
        # Switch decompression type; resets the per-member read counter.
        self._init_decomp(decomp_type)

    def _init_decomp(self, decomp_type):
        # num_block_read counts bytes produced for the current member; it is
        # used by _decompress to detect a failure on the very first read.
        self.num_block_read = 0
        if decomp_type:
            try:
                self.decomp_type = decomp_type
                self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
            except KeyError:
                raise Exception('Decompression type not supported: ' +
                                decomp_type)
        else:
            self.decomp_type = None
            self.decompressor = None

    def _fillbuff(self, block_size=None):
        """Refill the internal buffer from starting_data or the stream,
        but only once the current buffer has been fully consumed."""
        if not self.empty():
            return

        # can't read past next member
        if self.rem_length() > 0:
            return

        if self.starting_data:
            data = self.starting_data
            self.starting_data = None
        else:
            if not block_size:
                block_size = self.block_size
            data = self.stream.read(block_size)

        self._process_read(data)

    def _process_read(self, data):
        # Decompress (if configured) and expose the result as the new buffer.
        data = self._decompress(data)
        self.buff_size = len(data)
        self.num_read += self.buff_size
        self.num_block_read += self.buff_size
        self.buff = BytesIO(data)

    def _decompress(self, data):
        """Run data through the decompressor, falling back to the raw stream
        (or the alt deflate variant) if the very first read fails."""
        if self.decompressor and data:
            try:
                data = self.decompressor.decompress(data)
            except Exception as e:
                # if first read attempt, assume non-gzipped stream
                if self.num_block_read == 0:
                    if self.decomp_type == 'deflate':
                        # retry as raw deflate (no zlib header/trailer)
                        self._init_decomp('deflate_alt')
                        data = self._decompress(data)
                    else:
                        self.decompressor = None
                # otherwise (partly decompressed), something is wrong
                else:
                    print(str(e))
                    return b''
        return data

    def read(self, length=None):
        """
        Fill bytes and read some number of bytes
        (up to length if specified)
        <= length bytes may be read if reached the end of input
        if at buffer boundary, will attempt to read again until
        specified length is read
        """
        all_buffs = []
        while length is None or length > 0:
            self._fillbuff()
            buff = self.buff.read(length)
            if not buff:
                break

            all_buffs.append(buff)
            if length:
                length -= len(buff)

        return b''.join(all_buffs)

    def readline(self, length=None):
        """
        Fill buffer and read a full line from the buffer
        (up to specified length, if provided)
        If no newline found at end, try filling buffer again in case
        at buffer boundary.
        """
        if length == 0:
            return b''

        self._fillbuff()
        linebuff = self.buff.readline(length)

        # we may be at a boundary
        while not linebuff.endswith(b'\n'):
            if length:
                length -= len(linebuff)
                if length <= 0:
                    break

            self._fillbuff()

            if self.empty():
                break

            linebuff += self.buff.readline(length)

        return linebuff

    def empty(self):
        # True when the buffer was never filled or is fully consumed.
        return not self.buff or self.buff.tell() >= self.buff_size

    def read_next_member(self):
        """Advance to the next compressed member (e.g. the next gzip member
        in a multi-member stream). Returns False if there is none."""
        if not self.decompressor or not self.decompressor.unused_data:
            return False

        self.starting_data = self.decompressor.unused_data
        self._init_decomp(self.decomp_type)
        return True

    def rem_length(self):
        """Bytes still available without another stream read: unread buffered
        bytes plus any unused (next-member) compressed data."""
        rem = 0
        if self.buff:
            rem = self.buff_size - self.buff.tell()

        if self.decompressor and self.decompressor.unused_data:
            rem += len(self.decompressor.unused_data)
        return rem

    def close(self):
        # Close and drop the underlying stream; safe to call more than once.
        if self.stream:
            self.stream.close()
            self.stream = None

    @classmethod
    def get_supported_decompressors(cls):
        # Names accepted as decomp_type (keys view of DECOMPRESSORS).
        return cls.DECOMPRESSORS.keys()
#=================================================================
class DecompressingBufferedReader(BufferedReader):
    """
    A BufferedReader whose decompression type defaults to gzip
    when the caller does not specify one.
    """
    def __init__(self, *args, **kwargs):
        # Default to gzip unless an explicit decomp_type was passed.
        kwargs.setdefault('decomp_type', 'gzip')
        super(DecompressingBufferedReader, self).__init__(*args, **kwargs)
#=================================================================
class ChunkedDataException(Exception):
    """Raised when an http chunk-encoded stream cannot be parsed.

    ``data`` carries whatever bytes were already consumed so callers can
    fall back to treating the stream as non-chunked.
    """
    def __init__(self, msg, data=b''):
        super(ChunkedDataException, self).__init__(msg)
        self.data = data
#=================================================================
class ChunkedDataReader(BufferedReader):
    r"""
    A ChunkedDataReader is a DecompressingBufferedReader
    which also supports de-chunking of the data if it happens
    to be http 'chunk-encoded'.

    If at any point the chunked header is not available, the stream is
    assumed to not be chunked and no more dechunking occurs.
    """
    def __init__(self, stream, raise_exceptions=False, **kwargs):
        super(ChunkedDataReader, self).__init__(stream, **kwargs)
        # set once the terminating zero-length chunk has been read
        self.all_chunks_read = False
        # set when the stream turns out not to be chunk-encoded after all
        self.not_chunked = False

        # if False, we'll use best-guess fallback for parse errors
        self.raise_chunked_data_exceptions = raise_exceptions

    def _fillbuff(self, block_size=None):
        if self.not_chunked:
            return super(ChunkedDataReader, self)._fillbuff(block_size)

        # Loop over chunks until there is some data (not empty())
        # In particular, gzipped data may require multiple chunks to
        # return any decompressed result
        while (self.empty() and
               not self.all_chunks_read and
               not self.not_chunked):

            try:
                # chunk-size line is hex digits plus optional extensions;
                # 64 bytes is ample for any valid header
                length_header = self.stream.readline(64)
                self._try_decode(length_header)
            except ChunkedDataException as e:
                if self.raise_chunked_data_exceptions:
                    raise

                # Can't parse the data as chunked.
                # It's possible that non-chunked data is served
                # with a Transfer-Encoding: chunked.
                # Treat this as non-chunk encoded from here on.
                self._process_read(length_header + e.data)
                self.not_chunked = True

                # parse as block as non-chunked
                return super(ChunkedDataReader, self)._fillbuff(block_size)

    def _try_decode(self, length_header):
        """Parse one chunk (size line + payload + CRLF) and hand the payload
        to _process_read; raises ChunkedDataException on malformed input."""
        # decode length header
        try:
            chunk_size = int(length_header.strip().split(b';')[0], 16)
        except ValueError:
            raise ChunkedDataException(b"Couldn't decode length header " +
                                       length_header)

        if not chunk_size:
            # chunk_size 0 indicates end of file
            self.all_chunks_read = True
            self._process_read(b'')
            return

        data_len = 0
        data = b''

        # read chunk
        while data_len < chunk_size:
            new_data = self.stream.read(chunk_size - data_len)

            # if we unexpectedly run out of data,
            # either raise an exception or just stop reading,
            # assuming file was cut off
            if not new_data:
                if self.raise_chunked_data_exceptions:
                    msg = 'Ran out of data before end of chunk'
                    raise ChunkedDataException(msg, data)
                else:
                    chunk_size = data_len
                    self.all_chunks_read = True

            data += new_data
            data_len = len(data)

        # if we successfully read a block without running out,
        # it should end in \r\n
        if not self.all_chunks_read:
            clrf = self.stream.read(2)
            if clrf != b'\r\n':
                raise ChunkedDataException(b"Chunk terminator not found.",
                                           data)

        # hand to base class for further processing
        self._process_read(data)

    def read(self, length=None):
        """ read bytes from stream, if length specified,
        may read across multiple chunks to get exact length
        """
        buf = super(ChunkedDataReader, self).read(length)
        if not length:
            return buf

        # if length specified, attempt to read exact length
        rem = length - len(buf)
        while rem > 0:
            new_buf = super(ChunkedDataReader, self).read(rem)
            if not new_buf:
                break

            buf += new_buf
            rem -= len(new_buf)

        return buf

View File

@ -1,69 +0,0 @@
# ============================================================================
class LimitReader(object):
    """Wrap a stream so that at most ``limit`` bytes can be read from it."""

    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

        # only expose tell() when the wrapped stream supports it
        if hasattr(stream, 'tell'):
            self.tell = self._tell

    def _update(self, buff):
        # account for what was actually consumed
        self.limit -= len(buff)
        return buff

    def read(self, length=None):
        """Read up to ``length`` bytes, never exceeding the remaining limit."""
        length = self.limit if length is None else min(length, self.limit)
        if length == 0:
            return b''
        return self._update(self.stream.read(length))

    def readline(self, length=None):
        """Read one line, capped at ``length`` and the remaining limit."""
        length = self.limit if length is None else min(length, self.limit)
        if length == 0:
            return b''
        return self._update(self.stream.readline(length))

    def close(self):
        self.stream.close()

    def _tell(self):
        return self.stream.tell()

    @staticmethod
    def wrap_stream(stream, content_length):
        """
        If given content_length is an int >= 0, wrap the stream
        in a LimitReader. Otherwise, return the stream unaltered
        """
        try:
            content_length = int(content_length)
        except (ValueError, TypeError):
            return stream

        if content_length >= 0:
            # optimize: if already a LimitStream, set limit to
            # the smaller of the two limits
            if isinstance(stream, LimitReader):
                stream.limit = min(stream.limit, content_length)
            else:
                stream = LimitReader(stream, content_length)

        return stream

View File

@ -17,7 +17,7 @@ import base64
import cgi import cgi
from io import open, BytesIO from io import open, BytesIO
from pywb.utils.limitreader import LimitReader from warcio.limitreader import LimitReader
try: try:
from boto import connect_s3 from boto import connect_s3

View File

@ -1,285 +0,0 @@
"""
Representation and parsing of HTTP-style status + headers
"""
from pprint import pformat
from copy import copy
from six.moves import range
from six import iteritems
from pywb.utils.loaders import to_native_str
import uuid
WRAP_WIDTH = 80
#=================================================================
class StatusAndHeaders(object):
    """
    Parsed http-style status line and headers.

    The status line is the first line of a request/response; headers are
    kept as a list of (name, value) tuples so duplicates and ordering
    survive a round-trip. An optional protocol (e.g. 'HTTP/1.0') from the
    first line may also be stored.
    """
    def __init__(self, statusline, headers, protocol='', total_len=0):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol
        self.total_len = total_len

    def get_header(self, name, default_value=None):
        """Return the value of the first header matching ``name``
        (case-insensitive), or ``default_value`` if absent."""
        target = name.lower()
        for hdr_name, hdr_value in self.headers:
            if hdr_name.lower() == target:
                return hdr_value

        return default_value

    def add_header(self, name, value):
        self.headers.append((name, value))

    def replace_header(self, name, value):
        """Replace the last matching header with a new value, or append
        a new header. Returns the previous value, if any."""
        target = name.lower()
        for idx in reversed(range(len(self.headers))):
            curr_name, curr_value = self.headers[idx]
            if curr_name.lower() == target:
                self.headers[idx] = (curr_name, value)
                return curr_value

        self.headers.append((name, value))
        return None

    def replace_headers(self, header_dict):
        """Replace every header present in ``header_dict`` (keyed by
        lowercase name); append any keys not already present."""
        remaining = copy(header_dict)
        for idx in reversed(range(len(self.headers))):
            curr_name, curr_value = self.headers[idx]
            key = curr_name.lower()
            if key in remaining:
                self.headers[idx] = (curr_name, remaining.pop(key))

        for name, value in iteritems(remaining):
            self.headers.append((name, value))

    def remove_header(self, name):
        """Remove the last matching header (case-insensitive).
        Returns True if one was removed, False otherwise."""
        target = name.lower()
        for idx in reversed(range(len(self.headers))):
            if self.headers[idx][0].lower() == target:
                del self.headers[idx]
                return True

        return False

    def get_statuscode(self):
        """Return the status-code portion of the status line
        (assumes no protocol prefix remains on it)."""
        return self.statusline.split(' ', 1)[0]

    def validate_statusline(self, valid_statusline):
        """Ensure the status line starts with a positive numeric code;
        if not, substitute ``valid_statusline``. Returns True if valid."""
        try:
            code = int(self.get_statuscode())
            assert(code > 0)
            return True
        except(ValueError, AssertionError):
            self.statusline = valid_statusline
            return False

    def add_range(self, start, part_len, total_len):
        """Mark this response as a 206 partial response covering
        ``part_len`` bytes starting at ``start`` of ``total_len``."""
        content_range = 'bytes {0}-{1}/{2}'.format(start,
                                                   start + part_len - 1,
                                                   total_len)

        self.statusline = '206 Partial Content'
        self.replace_header('Content-Range', content_range)
        self.replace_header('Accept-Ranges', 'bytes')
        return self

    def __repr__(self):
        headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH)
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, headers_str)

    def __eq__(self, other):
        return (self.statusline == other.statusline and
                self.headers == other.headers and
                self.protocol == other.protocol)

    def __str__(self, exclude_list=None):
        return self.to_str(exclude_list)

    def __bool__(self):
        return bool(self.statusline or self.headers)

    __nonzero__ = __bool__

    def to_str(self, exclude_list):
        """Serialize as 'protocol statusline' plus CRLF-terminated header
        lines, skipping any header whose lowercase name is in
        ``exclude_list``."""
        first = self.protocol
        if first and self.statusline:
            first += ' '
        if self.statusline:
            first += self.statusline

        lines = [first] if first else []

        for hdr in self.headers:
            if exclude_list and hdr[0].lower() in exclude_list:
                continue
            lines.append(': '.join(hdr))

        result = '\r\n'.join(lines)
        if lines:
            result += '\r\n'
        return result

    def to_bytes(self, exclude_list=None):
        return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n'
#=================================================================
def _strip_count(string, total_read):
length = len(string)
return string.rstrip(), total_read + length
#=================================================================
class StatusAndHeadersParser(object):
    """
    Parser which consumes a stream supporting readline() to read
    a status line plus headers and return a StatusAndHeaders object.
    """
    def __init__(self, statuslist, verify=True):
        # statuslist: accepted status-line prefixes, e.g. ['HTTP/1.0', 'HTTP/1.1']
        self.statuslist = statuslist
        # verify: when True, reject status lines not starting with a known prefix
        self.verify = verify

    def parse(self, stream, full_statusline=None):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object

        support continuation headers starting with space or tab

        Raises EOFError if the stream is already at end-of-file, and
        StatusAndHeadersParserException if ``verify`` is set and the
        status line matches no prefix in ``self.statuslist``.
        """
        def readline():
            return to_native_str(stream.readline())

        # status line w newlines intact
        if full_statusline is None:
            full_statusline = readline()
        else:
            full_statusline = to_native_str(full_statusline)

        statusline, total_read = _strip_count(full_statusline, 0)

        headers = []

        # at end of stream
        if total_read == 0:
            raise EOFError()
        elif not statusline:
            # blank first line: empty result, but record bytes consumed
            return StatusAndHeaders(statusline=statusline,
                                    headers=headers,
                                    protocol='',
                                    total_len=total_read)

        # validate only if verify is set
        if self.verify:
            protocol_status = self.split_prefix(statusline, self.statuslist)

            if not protocol_status:
                msg = 'Expected Status Line starting with {0} - Found: {1}'
                msg = msg.format(self.statuslist, statusline)
                raise StatusAndHeadersParserException(msg, full_statusline)
        else:
            protocol_status = statusline.split(' ', 1)

        line, total_read = _strip_count(readline(), total_read)
        while line:
            # split into name/value at the first ':'
            result = line.split(':', 1)
            if len(result) == 2:
                name = result[0].rstrip(' \t')
                value = result[1].lstrip()
            else:
                # malformed line with no ':' -- discarded unless continued
                name = result[0]
                value = None

            next_line, total_read = _strip_count(readline(),
                                                 total_read)

            # append continuation lines, if any
            while next_line and next_line.startswith((' ', '\t')):
                if value is not None:
                    value += next_line
                next_line, total_read = _strip_count(readline(),
                                                     total_read)

            if value is not None:
                header = (name, value)
                headers.append(header)

            line = next_line

        # protocol_status is (protocol, rest-of-statusline) when both present
        if len(protocol_status) > 1:
            statusline = protocol_status[1].strip()
        else:
            statusline = ''

        return StatusAndHeaders(statusline=statusline,
                                headers=headers,
                                protocol=protocol_status[0],
                                total_len=total_read)

    @staticmethod
    def split_prefix(key, prefixs):
        """
        split key string into prefix and remainder
        for first matching prefix from a list
        (returns None implicitly when nothing matches)
        """
        key_upper = key.upper()
        for prefix in prefixs:
            if key_upper.startswith(prefix):
                plen = len(prefix)
                return (key_upper[:plen], key[plen:])

    @staticmethod
    def make_warc_id(id_=None):
        # build a WARC-Record-ID urn:uuid URI; uuid1 is time/host based
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)
#=================================================================
class StatusAndHeadersParserException(Exception):
    """Raised when a status line fails prefix verification.

    ``statusline`` keeps the offending line for diagnostics.
    """
    def __init__(self, msg, statusline):
        super(StatusAndHeadersParserException, self).__init__(msg)
        self.statusline = statusline

View File

@ -1,174 +0,0 @@
r"""
# DecompressingBufferedReader Tests
#=================================================================
# DecompressingBufferedReader readline()
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
' CDX N b a m s k r M S V g\n'
# detect not compressed
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
' CDX N b a m s k r M S V g\n'
# decompress with on the fly compression, default gzip compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
'ABC\n1234\n'
# decompress with on the fly compression, default 'inflate' compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
'ABC\n1234\n'
# error: invalid compress type
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read()
Traceback (most recent call last):
Exception: Decompression type not supported: bzip2
# invalid output when reading compressed data as not compressed
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
True
# DecompressingBufferedReader readline() with decompression (zipnum file, no header)
>>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
# test very small block size
>>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
>>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
'ABCDEFG\n'
'HIJK'
'LMN\n'
'OPQR\n'
'XY'
'Z'
''
# test zero length reads
>>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
>>> print_str(x.readline(0)); print_str(x.read(0))
''
''
# Chunk-Decoding Buffered Reader Tests
#=================================================================
Properly formatted chunked data:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
>>> print_str(c.read() + c.read() + c.read())
'1234'
Non-chunked data:
>>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
'xyz123!@#'
Non-chunked, compressed data, specify decomp_type
>>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
'ABCDEF'
Non-chunked, compressed data, specify compression separately
>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
'ABCDEF'
Non-chunked, compressed data, wrap in DecompressingBufferedReader
>>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
'\nABCDEF\nGHIJ'
Chunked compressed data
Split compressed stream into 10-byte chunk and a remainder chunk
>>> b = compress('ABCDEFGHIJKLMNOP')
>>> l = len(b)
>>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
>>> print_str(c.read())
'ABCDEFGHIJKLMNOP'
Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
>>> print_str(c.read() + c.read())
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
>>> print_str(c.read() + c.read())
'123412'
Zero-Length chunk:
>>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
''
"""
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.limitreader import LimitReader
from pywb import get_test_dir
import six
import zlib
import pytest
test_cdx_dir = get_test_dir() + 'cdx/'
test_zip_dir = get_test_dir() + 'zipcdx/'
def compress(buff):
    """Gzip-compress a text string and return the compressed bytes."""
    # wbits = MAX_WBITS + 16 selects gzip framing
    comp = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    return comp.compress(buff.encode('utf-8')) + comp.flush()
# plain "inflate"
def compress_alt(buff):
    """Deflate-compress a text string as a raw stream (plain "inflate").

    The 2-byte zlib header and 4-byte adler32 trailer are stripped so the
    result is bare deflate data.
    """
    comp = zlib.compressobj(6, zlib.DEFLATED)
    framed = comp.compress(buff.encode('utf-8')) + comp.flush()
    # drop gzip headers/tail
    return framed[2:-4]
# Brotli
def test_brotli():
    """Decompress a known brotli fixture and verify the content."""
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        # bug fix: the comparison result was previously discarded, so this
        # test could never fail -- assert the expected decompressed content
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
# Compression
def test_compress_mix():
    # error: compressed member, followed by not compressed -- now allowed!
    # read_next_member() resets decompression so trailing plain bytes
    # can be read after a complete gzip member
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b'123'
    #with pytest.raises(zlib.error):
    #    x.read()
    #error: Error -3 while decompressing: incorrect header check
# Errors
def test_err_chunk_cut_off():
    # Chunked data cut off with exceptions
    # (raise_exceptions=True turns truncation into ChunkedDataException)
    c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
    with pytest.raises(ChunkedDataException):
        c.read() + c.read()
    #ChunkedDataException: Ran out of data before end of chunk
def print_str(string):
    """Decode bytes to text for doctest output on py3; pass through on py2."""
    if six.PY3:
        return string.decode('utf-8')
    return string
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,27 +1,5 @@
#=================================================================
r""" r"""
# LimitReader Tests #=================================================================
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
'abcd'
>>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# zero-length read
>>> print_str(LimitReader(StringIO('a'), 0).readline(0))
''
# don't wrap if invalid length
>>> b = StringIO('b')
>>> LimitReader.wrap_stream(b, 'abc') == b
True
# BlockLoader Tests (includes LimitReader) # BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400)) >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
@ -143,27 +121,14 @@ import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import extract_client_cookie, extract_post_query from pywb.utils.loaders import extract_client_cookie, extract_post_query
from pywb.utils.loaders import append_post_query, read_last_line from pywb.utils.loaders import append_post_query, read_last_line
from pywb.utils.limitreader import LimitReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/' test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
def seek_read_full(seekable_reader, offset):
seekable_reader.seek(offset)
seekable_reader.readline() #skip
return seekable_reader.readline()
def test_s3_read_1(): def test_s3_read_1():
pytest.importorskip('boto') pytest.importorskip('boto')
@ -178,15 +143,6 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n' assert reader.readline() == b'WARC-Type: response\r\n'
def test_limit_post():
reader = LimitReader(BytesIO(b'abcdefg'), 3)
r = requests.request(method='POST',
url='http://httpbin.org/post',
data=reader,
headers={'Content-Length': '3'})
assert '"abc"' in r.text
# Error # Error
def test_err_no_such_file(): def test_err_no_such_file():
# no such file # no such file

View File

@ -1,182 +0,0 @@
"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# add range
>>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100)
StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'),
('Content-Range', 'bytes 10-13/100'),
('Accept-Ranges', 'bytes')])
# other protocol expected
>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# verify protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah
# allow unexpected/invalid protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')])
# test equality op
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
True
# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Another-Value'),
('Multi-Line', 'Value1 Also This')])
# remove header
>>> st1.remove_header('some')
True
# already removed
>>> st1.remove_header('Some')
False
# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
# case-insensitive match
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from six import StringIO
import pytest
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
HTTP/1.0 200 OK\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
status_headers_2 = """
"""
status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
status_headers_4 = "\
http/1.0 204 empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
unknown_protocol_headers = "\
OtherBlah\r\n\
Foo: Bar\r\n\
\r\n"
req_headers = "\
GET / HTTP/1.0\r\n\
Foo: Bar\r\n\
Content-Length: 0\r\n"
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()
def test_to_str_1():
    # round-trip: parsed headers serialize back with continuation
    # lines folded into a single header value
    res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)))

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1 Also This\r\n\
"
    assert(res == exp)
def test_to_str_exclude():
    # headers named in the exclude list are omitted from serialization
    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    res = sah.to_str(['multi-line'])

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
"
    assert(res == exp)

    # to_bytes appends the blank line terminating the header block
    assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n'))
def test_to_str_2():
    # request-style headers round-trip; a trailing blank line is not preserved
    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)))
    assert(res == req_headers)

    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n')))
    assert(res == req_headers)
def test_to_str_with_remove():
    # removed headers no longer appear in the serialized form
    res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    res.remove_header('Foo')

    exp = "\
GET / HTTP/1.0\r\n\
Content-Length: 0\r\n"
    assert(str(res) == exp)
def test_status_empty():
    # an empty stream raises EOFError rather than returning an empty result
    with pytest.raises(EOFError):
        StatusAndHeadersParser([], verify=False).parse(StringIO(''))
def test_status_one_word():
    # a bare one-word status line parses when verification is off
    res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert(str(res) == 'A\r\n')

View File

@ -1,330 +0,0 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""
import re
import time
import datetime
import calendar
from email.utils import parsedate, formatdate
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
ISO_DT = '%Y-%m-%dT%H:%M:%SZ'
PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'
def iso_date_to_datetime(string):
    """
    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    parts = DATE_TIMESPLIT.split(string)
    # a trailing 'Z' (or other non-digit) leaves an empty final field -- drop it
    if parts[-1] == '':
        parts = parts[:-1]

    return datetime.datetime(*(int(part) for part in parts))
def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    year, month, day, hour, minute, second = parsedate(string)[:6]
    return datetime.datetime(year, month, day, hour, minute, second)
def datetime_to_http_date(the_datetime):
    """
    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    # treat the datetime as UTC and emit an RFC 1123 date
    epoch_secs = calendar.timegm(the_datetime.utctimetuple())
    return formatdate(timeval=epoch_secs,
                      localtime=False,
                      usegmt=True)
def datetime_to_iso_date(the_datetime):
    """
    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'
    """
    # format using the module-level ISO_DT pattern ('%Y-%m-%dT%H:%M:%SZ')
    return the_datetime.strftime(ISO_DT)
def datetime_to_timestamp(the_datetime):
    """
    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """
    # format as a 14-digit YYYYmmddHHMMSS string (TIMESTAMP_14)
    return the_datetime.strftime(TIMESTAMP_14)
def timestamp_now():
    """
    >>> len(timestamp_now())
    14
    """
    # current UTC time as a 14-digit timestamp
    return datetime_to_timestamp(datetime.datetime.utcnow())
def timestamp20_now():
    """
    Create 20-digit timestamp, useful to timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20
    """
    # 14-digit timestamp plus 6 microsecond digits
    return datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S%f')
def iso_date_to_timestamp(string):
    """
    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    # ISO date -> datetime -> 14-digit timestamp
    return datetime_to_timestamp(iso_date_to_datetime(string))
def timestamp_to_iso_date(string):
    """
    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'
    """
    # 14-digit timestamp -> datetime -> ISO date
    return datetime_to_iso_date(timestamp_to_datetime(string))
def http_date_to_timestamp(string):
    """
    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    # RFC 1123 HTTP date -> datetime -> 14-digit timestamp
    return datetime_to_timestamp(http_date_to_datetime(string))
# pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_6_UP):
    """
    Right-pad a partial timestamp with the tail of ``pad_str``
    (default 6 digits). Strings already at least as long as ``pad_str``
    are returned unchanged.

    >>> pad_timestamp('20')
    '209912'

    >>> pad_timestamp('2014')
    '201412'

    >>> pad_timestamp('20141011')
    '20141011'

    >>> pad_timestamp('201410110010')
    '201410110010'
    """
    # slicing past the end yields '', so longer inputs pass through unchanged
    return string + pad_str[len(string):]
def timestamp_to_datetime(string):
    """
    Convert an up-to-14-digit timestamp to a datetime, padding missing
    digits *up* (end of period) and clamping each field to a valid range.

    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)

    # not a number!
    >>> timestamp_to_datetime('2010abc')
    datetime.datetime(2010, 12, 31, 23, 59, 59)
    """
    # pad to 6 digits
    string = pad_timestamp(string, PAD_6_UP)

    def clamp(val, min_, max_):
        # parse one field and clamp into [min_, max_]
        try:
            val = int(val)
            val = max(min_, min(val, max_))
            return val
        except ValueError:
            # bug fix: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit; only int() failure is expected
            return max_

    def extract(string, start, end, min_, max_):
        # use the field only if fully present; otherwise take the max pad
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        else:
            return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)
def timestamp_to_sec(string):
    """
    >>> timestamp_to_sec('20131226095010')
    1388051410

    # rounds to end of 2014
    >>> timestamp_to_sec('2014')
    1420070399
    """
    # partial timestamps are padded *up*, so '2014' maps to end of 2014
    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
def sec_to_timestamp(secs):
    """
    >>> sec_to_timestamp(1388051410)
    '20131226095010'

    >>> sec_to_timestamp(1420070399)
    '20141231235959'
    """
    # POSIX seconds -> UTC datetime -> 14-digit timestamp
    return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs))
def timestamp_to_http_date(string):
    """
    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    # 14-digit timestamp -> datetime -> RFC 1123 HTTP date
    return datetime_to_http_date(timestamp_to_datetime(string))
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,8 +1,8 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.warc.archiveiterator import ArchiveIterator from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator
import hashlib import hashlib
import base64 import base64
@ -71,7 +71,7 @@ class ArchiveIndexEntryMixin(object):
self['urlkey'] = canonicalize(url, surt_ordered) self['urlkey'] = canonicalize(url, surt_ordered)
other['urlkey'] = self['urlkey'] other['urlkey'] = self['urlkey']
referer = other.record.status_headers.get_header('referer') referer = other.record.http_headers.get_header('referer')
if referer: if referer:
self['_referer'] = referer self['_referer'] = referer
@ -141,7 +141,7 @@ class DefaultRecordParser(object):
for record in raw_iter: for record in raw_iter:
entry = None entry = None
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'):
continue continue
if record.rec_type == 'arc_header': if record.rec_type == 'arc_header':
@ -175,13 +175,13 @@ class DefaultRecordParser(object):
compute_digest = True compute_digest = True
elif not minimal and record.rec_type == 'request' and append_post: elif not minimal and record.rec_type == 'request' and append_post:
method = record.status_headers.protocol method = record.http_headers.protocol
len_ = record.status_headers.get_header('Content-Length') len_ = record.http_headers.get_header('Content-Length')
post_query = extract_post_query(method, post_query = extract_post_query(method,
entry.get('_content_type'), entry.get('_content_type'),
len_, len_,
record.stream) record.raw_stream)
entry['_post_query'] = post_query entry['_post_query'] = post_query
@ -236,7 +236,7 @@ class DefaultRecordParser(object):
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry['urlkey'] = entry['url'] entry['urlkey'] = entry['url']
entry['_warcinfo'] = record.stream.read(record.length) entry['_warcinfo'] = record.raw_stream.read(record.length)
return entry return entry
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri') entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
@ -252,13 +252,13 @@ class DefaultRecordParser(object):
entry['mime'] = '-' entry['mime'] = '-'
else: else:
def_mime = '-' if record.rec_type == 'request' else 'unk' def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers. entry.extract_mime(record.http_headers.
get_header('Content-Type'), get_header('Content-Type'),
def_mime) def_mime)
# status -- only for response records (by convention): # status -- only for response records (by convention):
if record.rec_type == 'response' and not self.options.get('minimal'): if record.rec_type == 'response' and not self.options.get('minimal'):
entry.extract_status(record.status_headers) entry.extract_status(record.http_headers)
else: else:
entry['status'] = '-' entry['status'] = '-'
@ -304,7 +304,7 @@ class DefaultRecordParser(object):
entry.extract_mime(record.rec_headers.get_header('content-type')) entry.extract_mime(record.rec_headers.get_header('content-type'))
# status # status
entry.extract_status(record.status_headers) entry.extract_status(record.http_headers)
# digest # digest
entry['digest'] = '-' entry['digest'] = '-'

View File

@ -1,233 +0,0 @@
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
import six
import sys
# ============================================================================
BUFF_SIZE = 16384
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed
    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.
    This file is probably not a multi-chunk gzip but a single gzip file.
    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.
    This file is likely still valid and you can use it by decompressing it:
    gunzip myfile.{0}.gz
    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:
    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False, block_size=BUFF_SIZE):
        """Wrap *fileobj* for record-at-a-time iteration.

        :param fileobj: binary stream containing ARC/WARC data
        :param no_record_parse: if True, skip parsing of http headers
        :param verify_http: passed through to ArcWarcRecordLoader
        :param arc2warc: if True, present ARC records as WARC-style records
        :param block_size: read size for the decompressing reader
        """
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        # offset in the underlying (possibly compressed) file of the
        # current record
        self.offset = 0
        # 'warc' or 'arc' once detected; None until first record parsed
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        # (offset, length) of the most recently consumed record, or None
        # while the current record has not yet been read to its end
        self.member_info = None
        self.no_record_parse = no_record_parse

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        # leftover non-blank line read while consuming trailing blanks;
        # fed to the next record parse as its first line
        self.next_line = None

        self._raise_invalid_gzip = False
        self._is_empty = False
        self._is_first = True
        self.last_record = None

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record, first consuming the previous one."""
        while True:
            if not self._is_first:
                self._finish_record()

            self._is_first = False

            try:
                self.last_record = self._next_record(self.next_line)
                if self._raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                return self.last_record

            except EOFError:
                # EOF may only mean end of one gzip member; loop again
                # and let _finish_record() decide whether to stop
                self._is_empty = True

    def _finish_record(self):
        """Consume the remainder of the last record and position the
        stream at the start of the next, or raise StopIteration."""
        if self.last_record:
            self.read_to_end(self.last_record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                return

            # if empty record, then we're done
            elif self._is_empty:
                raise StopIteration()

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                self._raise_invalid_gzip = True

        # non-gzip, so we're done
        elif self._is_empty:
            raise StopIteration()

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length
        count empty_size so that it can be substracted from
        the record length for uncompressed
        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                # end of stream reached
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            # first non-blank line after the blanks: hand back to caller
            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(BUFF_SIZE)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # compute absolute offset: bytes consumed from fh minus what is
        # still buffered but unread in the reader
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            # the lookahead line belongs to the *next* record
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            # uncompressed: trailing blanks are not part of record length
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record

View File

@ -1,6 +1,6 @@
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
#================================================================= #=================================================================

View File

@ -1,298 +0,0 @@
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.limitreader import LimitReader
from pywb.utils.loaders import to_native_str
#from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
# 'format, rec_type, rec_headers, ' +
# 'stream, status_headers, ' +
# 'content_type, length')
#=================================================================
class ArcWarcRecord(object):
    """Value object bundling everything parsed for a single ARC/WARC
    record: archive format, record type, record headers, the (limited)
    payload stream, parsed http status+headers (if any), the declared
    content type and the record length.
    """

    def __init__(self, *args):
        # positional unpacking preserved: callers pass exactly seven values
        (self.format,
         self.rec_type,
         self.rec_headers,
         self.stream,
         self.status_headers,
         self.content_type,
         self.length) = args
#=================================================================
class ArchiveLoadFailed(Exception):
    """Raised when an ARC/WARC record cannot be loaded or parsed.

    The message is ``'<filename>: <reason>'`` when a filename is given,
    otherwise just the reason; the final message is also kept on
    ``self.msg`` for callers that inspect it directly.
    """

    def __init__(self, reason, filename=''):
        msg = str(reason) if not filename else filename + ': ' + str(reason)

        super(ArchiveLoadFailed, self).__init__(msg)
        self.msg = msg
#=================================================================
class ArcWarcRecordLoader(object):
    """Parse a single ARC or WARC record from a stream, auto-detecting
    the archive format and, where applicable, the embedded HTTP message.
    """

    # accepted WARC version statuslines
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """:param verify_http: strict verification of http statuslines
        :param arc2warc: if True, ARC headers are converted to WARC-style
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.
        Pass statusline and known_format to detect_type_loader_headers()
        to faciliate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC header bytes count against the declared record length
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # converted ARC: subtract consumed header bytes and
                # present the record as 'warc' from here on
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # or status and headers are completely empty (blank lines found)
        elif not rec_headers:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None#StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.
        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # retry the failed statusline as an ARC header line
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
#=================================================================
class ARCHeadersParser(object):
    """Parse the space-separated ARC 1.0 record header line into a
    StatusAndHeaders object."""

    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        # format tag reported back to the record loader
        return 'arc'

    def parse(self, stream, headerline=None):
        """Parse one ARC header line (read from *stream* unless supplied).

        Raises EOFError at end of stream and
        StatusAndHeadersParserException on a malformed header line.
        """
        total_read = 0

        def readline():
            return to_native_str(stream.readline())

        # if headerline passed in, use that
        if headerline is None:
            headerline = readline()
        else:
            headerline = to_native_str(headerline)

        header_len = len(headerline)

        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()

        headernames = self.headernames

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
            version = readline()  # skip version
            spec = readline()  # skip header spec, use preset one
            total_read += len(version)
            total_read += len(spec)

        parts = headerline.split(' ')

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        # NOTE(review): the protocol returned by the helper ('ARC/1.0' or
        # 'WARC/1.0') is discarded and 'WARC/1.0' is always used here --
        # confirm this is intentional for downstream format handling
        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='WARC/1.0',
                                total_len=total_read)

    @classmethod
    def get_header_names(cls):
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Pair each ARC header name with its value from the header line."""
        headers = []

        for name, value in zip(self.headernames, parts):
            headers.append((name, value))

        return ('ARC/1.0', headers)
#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    """ARC header parser that maps ARC fields directly to their WARC
    header equivalents, synthesizing WARC-Type and WARC-Record-ID."""

    # Headers for converting ARC -> WARC Header
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        # reported to the loader so it knows header bytes were consumed
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Build WARC-style headers from the ARC header line values."""
        headers = []

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                # ARC uses a 14-digit timestamp; WARC wants ISO dates
                value = timestamp_to_iso_date(value)

            headers.append((name, value))

        if headerline.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

        headers.append(('WARC-Type', rec_type))
        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
        return ('WARC/1.0', headers)

View File

@ -1,6 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
import six import six
@ -27,20 +28,20 @@ class ResolvingLoader(object):
elif headers_record != payload_record: elif headers_record != payload_record:
# close remainder of stream as this record only used for # close remainder of stream as this record only used for
# (already parsed) headers # (already parsed) headers
headers_record.stream.close() headers_record.raw_stream.close()
# special case: check if headers record is actually empty # special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit # (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers: if not headers_record.http_headers.headers:
headers_record = payload_record headers_record = payload_record
if not headers_record or not payload_record: if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx)) raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here # ensure status line is valid from here
headers_record.status_headers.validate_statusline('204 No Content') headers_record.http_headers.validate_statusline('204 No Content')
return (headers_record.status_headers, payload_record.stream) return (headers_record.http_headers, payload_record.raw_stream)
def load_headers_and_payload(self, cdx, failed_files, cdx_loader): def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
""" """
@ -107,7 +108,7 @@ class ResolvingLoader(object):
# optimization: if same file already failed this request, # optimization: if same file already failed this request,
# don't try again # don't try again
if failed_files is not None and filename in failed_files: if failed_files is not None and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed', filename) raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)
any_found = False any_found = False
last_exc = None last_exc = None
@ -144,7 +145,7 @@ class ResolvingLoader(object):
msg = 'Archive File Not Found' msg = 'Archive File Not Found'
#raise ArchiveLoadFailed(msg, filename), None, last_traceback #raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback) six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
def _load_different_url_payload(self, cdx, headers_record, def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader): failed_files, cdx_loader):

View File

@ -292,13 +292,13 @@ import sys
import pprint import pprint
import six import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
import pywb.utils.statusandheaders import warcio.statusandheaders
from pywb import get_test_dir from pywb import get_test_dir
@ -331,10 +331,12 @@ def load_test_archive(test_file, offset, length):
archive = testloader.load(path, offset, length) archive = testloader.load(path, offset, length)
pywb.utils.statusandheaders.WRAP_WIDTH = 160 warcio.statusandheaders.WRAP_WIDTH = 160
pprint.pprint(((archive.format, archive.rec_type), pprint.pprint(((archive.format, archive.rec_type),
archive.rec_headers, archive.status_headers), indent=1, width=160) archive.rec_headers, archive.http_headers), indent=1, width=160)
warcio.statusandheaders.WRAP_WIDTH = 80
#============================================================================== #==============================================================================

View File

@ -1,121 +0,0 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.warc.warcwriter import BufferWARCWriter
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.archiveiterator import ArchiveIterator
from io import BytesIO
from collections import OrderedDict
import json
# ============================================================================
class FixedTestWARCWriter(BufferWARCWriter):
    """WARC writer with deterministic record ids and dates so serialized
    records can be compared byte-for-byte in tests."""

    FIXED_WARC_ID = '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'
    FIXED_WARC_DATE = '2000-01-01T00:00:00Z'

    @classmethod
    def _make_warc_id(cls, id_=None):
        # ignore any supplied id: every record gets the same fixed id
        return cls.FIXED_WARC_ID

    @classmethod
    def _make_warc_date(cls):
        return cls.FIXED_WARC_DATE
# ============================================================================
class TestWarcWriter(object):
    """Round-trip tests for the WARC writer: serialize records, then
    re-parse them and compare against fixed expected byte strings."""

    def _validate_record_content_len(self, stream):
        # re-iterate the written output and verify the declared
        # Content-Length matches the actual payload length
        for record in ArchiveIterator(stream, no_record_parse=True):
            assert record.status_headers == None
            assert int(record.rec_headers.get_header('Content-Length')) == record.length
            assert record.length == len(record.stream.read())

    def test_warcinfo_record(self):
        """Write a warcinfo record and verify headers, body and the
        exact serialized form."""
        simplewriter = FixedTestWARCWriter(gzip=False)
        params = OrderedDict([('software', 'recorder test'),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps({'foo': 'bar'}))])

        record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
        simplewriter.write_record(record)
        buff = simplewriter.get_contents()
        assert isinstance(buff, bytes)

        buff = BytesIO(buff)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'

        buff = parsed_record.stream.read().decode('utf-8')

        length = parsed_record.rec_headers.get_header('Content-Length')

        assert len(buff) == int(length)

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff

        # exact expected serialization (deterministic id/date from
        # FixedTestWARCWriter)
        warcinfo_record = '\
WARC/1.0\r\n\
WARC-Type: warcinfo\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Filename: testfile.warc.gz\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: application/warc-fields\r\n\
Content-Length: 86\r\n\
\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
\r\n\
\r\n\
'

        assert simplewriter.get_contents().decode('utf-8') == warcinfo_record

    def test_generate_response(self):
        """Write a response record with explicit http headers and verify
        the exact serialized form and content lengths."""
        headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
                        ('Custom-Header', 'somevalue')
                       ]

        payload = b'some\ntext'

        status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')

        writer = FixedTestWARCWriter(gzip=False)

        record = writer.create_warc_record('http://example.com/', 'response',
                                           payload=BytesIO(payload),
                                           length=len(payload),
                                           status_headers=status_headers)

        writer.write_record(record)

        buff = writer.get_contents()

        self._validate_record_content_len(BytesIO(buff))

        warc_record = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 97\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\
\r\n\
\r\n\
'

        assert buff.decode('utf-8') == warc_record

View File

@ -1,304 +0,0 @@
import tempfile
import uuid
import base64
import hashlib
import datetime
import zlib
import six
from socket import gethostname
from io import BytesIO
from pywb.utils.loaders import to_native_str
from pywb.utils.timeutils import datetime_to_iso_date
from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader
# ============================================================================
class BaseWARCWriter(object):
    """Build and serialize WARC records (warcinfo, response, request,
    revisit, metadata), computing digests and Content-Length, optionally
    gzip-compressing each record as its own gzip member."""

    BUFF_SIZE = 16384

    # default Content-Type per WARC record type
    WARC_RECORDS = {'warcinfo': 'application/warc-fields',
                    'response': 'application/http; msgtype=response',
                    'revisit': 'application/http; msgtype=response',
                    'request': 'application/http; msgtype=request',
                    'metadata': 'application/warc-fields',
                   }

    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'

    WARC_VERSION = 'WARC/1.0'

    def __init__(self, gzip=True, header_filter=None, *args, **kwargs):
        """:param gzip: if True, each record is written as its own gzip member
        :param header_filter: callable(record) -> list of http header names
               to exclude from the serialized headers
        :param kwargs: may include 'warc_version' to override WARC/1.0
        """
        self.gzip = gzip
        self.header_filter = header_filter
        self.hostname = gethostname()

        # lenient parser used for copying records / parsing http headers
        self.parser = StatusAndHeadersParser([], verify=False)
        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)

    @classmethod
    def _iter_stream(cls, stream):
        # yield the stream in BUFF_SIZE chunks until exhausted
        while True:
            buf = stream.read(cls.BUFF_SIZE)
            if not buf:
                return

            yield buf

    def ensure_digest(self, record):
        """Compute and set WARC-Block-Digest / WARC-Payload-Digest on
        *record* unless both are already present. Restores the stream
        position afterwards."""
        block_digest = record.rec_headers.get_header('WARC-Block-Digest')
        payload_digest = record.rec_headers.get_header('WARC-Payload-Digest')
        if block_digest and payload_digest:
            return

        block_digester = self._create_digester()
        payload_digester = self._create_digester()

        pos = record.stream.tell()

        # block digest covers the serialized http headers + payload;
        # payload digest covers the payload only
        if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
            block_digester.update(record.status_headers.headers_buff)

        for buf in self._iter_stream(record.stream):
            block_digester.update(buf)
            payload_digester.update(buf)

        record.stream.seek(pos)
        record.rec_headers.add_header('WARC-Block-Digest', str(block_digester))
        record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester))

    def _create_digester(self):
        return Digester('sha1')

    def _set_header_buff(self, record):
        # pre-serialize http headers (minus any filtered names) so their
        # byte length is known for Content-Length computation
        exclude_list = None
        if self.header_filter:
            exclude_list = self.header_filter(record)
        buff = record.status_headers.to_bytes(exclude_list)
        record.status_headers.headers_buff = buff

    def create_warcinfo_record(self, filename, info):
        """Build a warcinfo record whose body is *info* rendered as
        'name: value' warc-fields lines."""
        warc_headers = StatusAndHeaders(self.warc_version, [])
        warc_headers.add_header('WARC-Type', 'warcinfo')
        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
        if filename:
            warc_headers.add_header('WARC-Filename', filename)
        warc_headers.add_header('WARC-Date', self._make_warc_date())

        warcinfo = BytesIO()
        for n, v in six.iteritems(info):
            self._header(warcinfo, n, v)

        warcinfo.seek(0)

        record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
                               None, '', len(warcinfo.getvalue()))

        return record

    def copy_warc_record(self, payload):
        """Re-create a record from an already-serialized record buffer
        (WARC headers followed by content)."""
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = self.parser.parse(payload)

        record_type = warc_headers.get_header('WARC-Type', 'response')

        return self._fill_record(record_type, warc_headers, None, payload, '', len_)

    def create_warc_record(self, uri, record_type, payload,
                           length=None,
                           warc_content_type='',
                           warc_headers_dict={},
                           status_headers=None):
        """Build a new WARC record of *record_type* for *uri* from *payload*.

        NOTE(review): warc_headers_dict={} is a mutable default argument;
        it is only read from here, but confirm no caller mutates it.
        """
        if length is None:
            # infer length from current position, then rewind for reading
            length = payload.tell()
            payload.seek(0)

        warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
        warc_headers.replace_header('WARC-Type', record_type)
        if not warc_headers.get_header('WARC-Record-ID'):
            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())

        if uri:
            warc_headers.replace_header('WARC-Target-URI', uri)

        if not warc_headers.get_header('WARC-Date'):
            warc_headers.add_header('WARC-Date', self._make_warc_date())

        return self._fill_record(record_type, warc_headers, status_headers,
                                 payload, warc_content_type, length)

    def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_):
        """Assemble the ArcWarcRecord, parsing http headers from the
        payload when needed, and ensure digests / header buffer are set."""
        has_http_headers = (record_type in ('request', 'response', 'revisit'))

        if not status_headers and has_http_headers:
            status_headers = self.parser.parse(payload)

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               status_headers, warc_content_type, len_)

        self.ensure_digest(record)

        if has_http_headers:
            self._set_header_buff(record)

        return record

    def _write_warc_record(self, out, record, adjust_cl=True):
        """Serialize *record* to *out*, setting Content-Type and
        Content-Length headers; gzip-wraps the output if enabled."""
        if self.gzip:
            out = GzippingWrapper(out)

        # compute Content-Type
        content_type = record.rec_headers.get_header('Content-Type')

        if not content_type:
            content_type = record.content_type

            if not content_type:
                content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type'))

            if content_type:
                record.rec_headers.replace_header('Content-Type', content_type)
                #self._header(out, 'Content-Type', content_type)

        # revisit records serialize only the http headers, not the payload
        if record.rec_headers.get_header('WARC-Type') == 'revisit':
            http_headers_only = True
        else:
            http_headers_only = False

        # compute Content-Length
        if record.length or record.status_headers:
            actual_len = 0

            if record.status_headers:
                actual_len = len(record.status_headers.headers_buff)

            if not http_headers_only:
                if adjust_cl:
                    # subtract http header bytes already consumed from
                    # the stream when the headers were parsed
                    diff = record.stream.tell() - actual_len
                else:
                    diff = 0

                actual_len = record.length - diff

            record.rec_headers.replace_header('Content-Length', str(actual_len))
            #self._header(out, 'Content-Length', str(actual_len))

            # add empty line
            #self._line(out, b'')

            # write record headers
            out.write(record.rec_headers.to_bytes())

            # write headers buffer, if any
            if record.status_headers:
                out.write(record.status_headers.headers_buff)

            if not http_headers_only:
                for buf in self._iter_stream(record.stream):
                    out.write(buf)
                #out.write(record.stream.read())

            # add two lines
            self._line(out, b'\r\n')
        else:
            # add three lines (1 for end of header, 2 for end of record)
            self._line(out, b'Content-Length: 0\r\n\r\n')

        out.flush()

    def _header(self, out, name, value):
        # skip empty values entirely (no 'name:' line emitted)
        if not value:
            return

        self._line(out, (name + ': ' + str(value)).encode('latin-1'))

    def _line(self, out, line):
        out.write(line + b'\r\n')

    @classmethod
    def _make_warc_id(cls, id_=None):
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)

    @classmethod
    def _make_warc_date(cls):
        return datetime_to_iso_date(datetime.datetime.utcnow())
# ============================================================================
class GzippingWrapper(object):
    """File-like adapter that gzip-compresses everything written to it
    and forwards the compressed bytes to the wrapped output stream.

    flush() must be called once at the end to emit the final gzip
    trailer before the stream is complete.
    """

    def __init__(self, out):
        # wbits = MAX_WBITS + 16 selects the gzip container format
        # (rather than a raw deflate or zlib stream), level 9 compression
        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        self.out = out

    def write(self, buff):
        compressed = self.compressor.compress(buff)
        self.out.write(compressed)

    def flush(self):
        self.out.write(self.compressor.flush())
        self.out.flush()
# ============================================================================
class Digester(object):
    """Incremental digest accumulator that renders as
    '<algo>:<base32-encoded-digest>' (the WARC digest header format)."""

    def __init__(self, type_='sha1'):
        self.type_ = type_
        self.digester = hashlib.new(type_)

    def update(self, buff):
        self.digester.update(buff)

    def __str__(self):
        encoded = base64.b32encode(self.digester.digest())
        return '{0}:{1}'.format(self.type_, to_native_str(encoded))
# ============================================================================
class BufferWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(BufferWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def write_record(self, record):
self._write_warc_record(self.out, record)
def get_contents(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff
# ============================================================================
class FileWARCWriter(BufferWARCWriter):
    """WARC writer that writes to a named file or an already-open
    file object.

    The target is given either as the first positional argument or as
    the 'file' keyword argument; remaining arguments are forwarded to
    BufferWARCWriter / BaseWARCWriter (e.g. gzip=..., header_filter=...).
    """

    def __init__(self, *args, **kwargs):
        file_or_buff = None
        if len(args) > 0:
            file_or_buff = args[0]
            args = args[1:]
        else:
            file_or_buff = kwargs.pop('file', None)

        # bug fix: the base __init__ was never called, leaving writer
        # state (gzip, header_filter, parser, warc_version) uninitialized;
        # initialize it first (self.out is then replaced below)
        super(FileWARCWriter, self).__init__(*args, **kwargs)

        if isinstance(file_or_buff, str):
            # bug fix: previously opened with 'rb' -- a read-only handle
            # on which every write fails; open for binary append instead
            self.out = open(file_or_buff, 'a+b')
        elif hasattr(file_or_buff, 'read'):
            # NOTE(review): checks for 'read' although the writer only
            # writes; kept for backward compatibility -- confirm callers
            # always pass read/write-capable objects
            self.out = file_or_buff
        else:
            raise Exception('file must be a readable or valid filename')

View File

@ -5,7 +5,8 @@ import json
import time import time
import os import os
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.cdx.cdxops import process_cdx from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery

View File

@ -2,7 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.warc.recordloader import ArchiveLoadFailed from warcio.recordloader import ArchiveLoadFailed
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules

View File

@ -1,6 +1,6 @@
from pywb.utils.binsearch import iter_range from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException

View File

@ -1,6 +1,7 @@
from warcio.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.limitreader import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from six.moves.urllib.parse import urlsplit, quote from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO from six import iteritems, StringIO

View File

@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.webagg.responseloader import LiveWebLoader from pywb.webagg.responseloader import LiveWebLoader
from pywb.webagg.utils import ParamFormatter, res_template from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
#============================================================================= #=============================================================================

View File

@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
from pywb.webagg.utils import ParamFormatter from pywb.webagg.utils import ParamFormatter
from pywb.webagg.indexsource import RedisIndexSource from pywb.webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
status = cdx.get('status') status = cdx.get('status')
if not status or status.startswith('3'): if not status or status.startswith('3'):
status_headers = self.headers_parser.parse(payload.stream) http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx, self.raise_on_self_redirect(params, cdx,
status_headers.get_statuscode(), http_headers.get_statuscode(),
status_headers.get_header('Location')) http_headers.get_header('Location'))
http_headers_buff = status_headers.to_bytes() http_headers_buff = http_headers.to_bytes()
else: else:
http_headers_buff = None http_headers_buff = None
@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
warc_headers.replace_header('WARC-Date', warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date')) headers.rec_headers.get_header('WARC-Date'))
headers.stream.close() headers.raw_stream.close()
return (warc_headers, http_headers_buff, payload.stream) return (warc_headers, http_headers_buff, payload.raw_stream)
def __str__(self): def __str__(self):
return 'WARCPathLoader' return 'WARCPathLoader'

View File

@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO from io import BytesIO
from six.moves.urllib.parse import urlencode from six.moves.urllib.parse import urlencode
@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.status_headers) print(record.http_headers)
assert record.status_headers.get_statuscode() == '302' assert record.http_headers.get_statuscode() == '302'
assert record.status_headers.get_header('Location') == 'https://www.iana.org/' assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live')) @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self): def test_agg_select_live(self):

View File

@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
from pywb.webagg.aggregator import SimpleAggregator from pywb.webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from .testutils import key_ts_res, TEST_CDX_PATH from .testutils import key_ts_res, TEST_CDX_PATH

View File

@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.aggregator import SimpleAggregator from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from .testutils import LiveServerTests, BaseTestClass from .testutils import LiveServerTests, BaseTestClass
@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''
def test_upstream_1(self): def test_upstream_1(self):
resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''
def test_upstream_2(self): def test_upstream_2(self):
resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''

View File

@ -7,7 +7,9 @@ import zlib
from contextlib import closing from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>') LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)') LINK_PROP = re.compile('([\w]+)="([^"]+)')
BUFF_SIZE = 16384
#============================================================================= #=============================================================================
class MementoException(BadRequestException): class MementoException(BadRequestException):
pass pass

View File

@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@ -5,9 +5,11 @@ import logging
from datetime import datetime from datetime import datetime
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader from pywb.utils.loaders import LocalFileLoader
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
@ -19,7 +21,6 @@ from pywb.warc.pathresolvers import PathResolverMapper
from pywb.webapp.views import J2TemplateView, init_view from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp
#================================================================= #=================================================================

View File

@ -1,5 +1,6 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.limitreader import LimitReader from warcio.limitreader import LimitReader
from pywb.framework.cache import create_cache from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile, mkdtemp from tempfile import NamedTemporaryFile, mkdtemp

View File

@ -5,16 +5,17 @@ from io import BytesIO
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
from itertools import chain from itertools import chain
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.limitreader import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.webapp.views import HeadInsertView from pywb.webapp.views import HeadInsertView

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT from pywb.framework.memento import make_timemap, LINK_FORMAT

View File

@ -71,6 +71,7 @@ setup(
], ],
install_requires=[ install_requires=[
'six', 'six',
'warcio',
'chardet', 'chardet',
'requests', 'requests',
'redis', 'redis',