Merge branch 'develop' for 0.7.5

2025-03-24 06:59:52 +01:00 · 2015-01-12 00:50:16 -08:00 · 2015-01-12 00:50:16 -08:00 · c935aa5ec9
commit c935aa5ec9
parent de403c457e 43805c67ef
28 changed files with 158 additions and 75 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,4 @@
 *.arc  -text
 *.warc -text
 *.cdx  -text
 *.gz   -text
--- a/CHANGES.rst
+++ b/CHANGES.rst
@ -1,3 +1,16 @@
 pywb 0.7.5 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 * Cross-platform fixes to support Windows -- all tests pass on Linux, OS X and Windows now. Improved cross-platform support includes:
  - read all files as binary to avoid line ending issues
  - properly convert between URLs and file paths
  - avoid platform-dependent APIs
 * Change any unhandled exceptions to result in a 500 error, instead of 400.
 * More comprehensive client-side ``src`` attribute rewriting (via wombat.js), additional server-side HTML tag rewriting.
 pywb 0.7.2 changelist
 ~~~~~~~~~~~~~~~~~~~~~
--- a/README.rst
+++ b/README.rst
@ -1,4 +1,4 @@
-PyWb 0.7.2
+PyWb 0.7.5
 ==========
 .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@ -13,7 +13,7 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
 pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
 The replay system is designed to accurately replay complex dynamic sites, including video and audio content.
-pywb can be used as a traditional web application or an HTTP or HTTPS proxy server.
+pywb can be used as a traditional web application or an HTTP or HTTPS proxy server, and has been tested on Linux, OS X and Windows platforms.
 pywb is also fully compliant with the `Memento <http://mementoweb.org/>`_ protocol (`RFC-7089 <http://tools.ietf.org/html/rfc7089>`_).
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -30,7 +30,7 @@ class CDXFile(CDXSource):
    def load_cdx(self, query):
        def do_open():
            try:
-                source = open(self.filename)
+                source = open(self.filename, 'rb')
                gen = iter_range(source, query.key, query.end_key)
                for line in gen:
                    yield line
--- a/pywb/cdx/test/test_redis_source.py
+++ b/pywb/cdx/test/test_redis_source.py
@ -26,7 +26,7 @@ test_cdx_dir = get_test_dir() + 'cdx/'
 def load_cdx_into_redis(source, filename, key=None):
    # load a cdx into mock redis
-    with open(test_cdx_dir + filename) as fh:
+    with open(test_cdx_dir + filename, 'rb') as fh:
        for line in fh:
            zadd_cdx(source, line, key)
--- a/pywb/cdx/zipnum.py
+++ b/pywb/cdx/zipnum.py
@ -84,7 +84,7 @@ class ZipNumCluster(CDXSource):
        self.loc_mtime = new_mtime
        logging.debug('Loading loc from: ' + self.loc_filename)
-        with open(self.loc_filename) as fh:
+        with open(self.loc_filename, 'rb') as fh:
            for line in fh:
                parts = line.rstrip().split('\t')
                self.loc_map[parts[0]] = parts[1:]
@ -112,7 +112,7 @@ class ZipNumCluster(CDXSource):
    def load_cdx(self, query):
        self.load_loc()
-        reader = open(self.summary)
+        reader = open(self.summary, 'rb')
        idx_iter = iter_range(reader,
                              query.key,
--- a/pywb/framework/certauth.py
+++ b/pywb/framework/certauth.py
@ -13,8 +13,8 @@ from argparse import ArgumentParser
 #=================================================================
-# Duration of 100 years
+# Duration of 10 years
-CERT_DURATION = 100 * 365 * 24 * 60 * 60
+CERT_DURATION = 10 * 365 * 24 * 60 * 60
 CERTS_DIR = './ca/certs/'
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@ -334,7 +334,7 @@ class ProxyRouter(object):
                return None
            buff = ''
-            with open(self.ca.ca_file) as fh:
+            with open(self.ca.ca_file, 'rb') as fh:
                buff = fh.read()
            content_type = 'application/x-x509-ca-cert'
--- a/pywb/framework/test/test_certauth.py
+++ b/pywb/framework/test/test_certauth.py
@ -5,8 +5,8 @@ import shutil
 from pywb.framework.certauth import main, CertificateAuthority
-TEST_CA_DIR = './pywb/framework/test/pywb_test_ca_certs'
+TEST_CA_DIR = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca_certs')
-TEST_CA_ROOT = './pywb/framework/test/pywb_test_ca.pem'
+TEST_CA_ROOT = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca.pem')
 def setup_module():
    openssl_support = pytest.importorskip("OpenSSL")
--- a/pywb/framework/test/test_wsgi_wrapper.py
+++ b/pywb/framework/test/test_wsgi_wrapper.py
@ -14,7 +14,7 @@ class TestOkApp:
 class TestErrApp:
    def __call__(self, env):
-        raise Exception('Test Error')
+        raise Exception('Test Unexpected Error')
 class TestCustomErrApp:
    def __call__(self, env):
@ -41,8 +41,8 @@ def test_err_app():
    testapp = webtest.TestApp(the_app)
    resp = testapp.get('/abc', expect_errors=True)
-    assert resp.status_int == 400
+    assert resp.status_int == 500
-    assert '400 Bad Request Error: Test Error' in resp.body
+    assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
 def test_custom_err_app():
    the_app = init_app(initer(TestCustomErrApp), load_yaml=False)
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@ -118,7 +118,7 @@ class WSGIApp(object):
        if hasattr(exc, 'status'):
            status = exc.status()
        else:
-            status = '400 Bad Request'
+            status = '500 Internal Server Error'
        if hasattr(exc, 'url'):
            err_url = exc.url
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@ -30,6 +30,8 @@ class HTMLRewriterMixin(object):
            'base':    {'href': defmod},
            'blockquote': {'cite': defmod},
            'body':    {'background': 'im_'},
            'button':  {'formaction': defmod},
            'command': {'icon': 'im_'},
            'del':     {'cite': defmod},
            'embed':   {'src': 'oe_'},
            'head':    {'': defmod},  # for head rewriting
@ -37,7 +39,8 @@ class HTMLRewriterMixin(object):
            'img':     {'src': 'im_',
                        'srcset': 'im_'},
            'ins':     {'cite': defmod},
-            'input':   {'src': 'im_'},
+            'input':   {'src': 'im_',
                        'formaction': defmod},
            'form':    {'action': defmod},
            'frame':   {'src': 'fr_'},
            'link':    {'href': 'oe_'},
@ -49,7 +52,8 @@ class HTMLRewriterMixin(object):
            'ref':     {'href': 'oe_'},
            'script':  {'src': 'js_'},
            'source':  {'src': 'oe_'},
-            'video':   {'src': 'oe_'},
+            'video':   {'src': 'oe_',
                        'poster': 'im_'},
            'div':     {'data-src': defmod,
                        'data-uri': defmod},
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@ -6,10 +6,11 @@ import requests
 import datetime
 import mimetypes
 import logging
 import os
 from urlparse import urlsplit
-from pywb.utils.loaders import is_http, LimitReader, BlockLoader
+from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
 from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
@ -180,11 +181,18 @@ class LiveRewriter(object):
        if url.startswith('//'):
            url = 'http:' + url
        if is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)
        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)
-        if is_http(url):
+        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@ -103,12 +103,17 @@
 'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
 # HttpsUrlRewriter tests
->>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
+>>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None)
 >>> httpsrewriter.rewrite('https://example.com/abc')
 'http://example.com/abc'
->>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
+>>> httpsrewriter.rewrite('http://example.com/abc')
 'http://example.com/abc'
 # rebase is identity
 >>> httpsrewriter.rebase_rewriter('https://example.com/') == httpsrewriter
 True
 """
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@ -105,6 +105,8 @@ _WBWombat = (function() {
                        "http:/" + prefix, "https:/" + prefix];
    }
    var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"];
    //============================================
    function rewrite_url_(url) {
        // If undefined, just return it
@ -692,12 +694,9 @@ _WBWombat = (function() {
                    }
                    override_attr(created, "src");
-                } else if (created.tagName == "IMG" || created.tagName == "VIDEO" || created.tagName == "AUDIO") {
+                } else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
                    override_attr(created, "src");
                }
 //                } else if (created.tagName == "A") {
 //                    override_attr(created, "href");
 //                }
                return created;
            }
--- a/pywb/utils/bufferedreaders.py
+++ b/pywb/utils/bufferedreaders.py
@ -46,9 +46,6 @@ class BufferedReader(object):
        self.buff_size = 0
    def set_decomp(self, decomp_type):
        if self.num_read > 0:
            raise Exception('Attempting to change decompression mid-stream')
        self._init_decomp(decomp_type)
    def _init_decomp(self, decomp_type):
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@ -7,6 +7,7 @@ import os
 import hmac
 import urllib
 import urllib2
 import urlparse
 import time
 import pkg_resources
 from io import open
@ -17,6 +18,15 @@ def is_http(filename):
    return filename.startswith(('http://', 'https://'))
 #=================================================================
 def to_file_url(filename):
    """ Convert a filename to a file:// url
    """
    url = os.path.abspath(filename)
    url = urlparse.urljoin('file:', urllib.pathname2url(url))
    return url
 #=================================================================
 def load_yaml_config(config_file):
    import yaml
@ -39,12 +49,12 @@ def extract_post_query(method, mime, length, stream):
         not mime.lower().startswith('application/x-www-form-urlencoded'))):
        return None
    if not length or length == '0':
        return None
    try:
        length = int(length)
-    except ValueError:
+    except (ValueError, TypeError):
        return None
    if length <= 0:
        return None
    #todo: encoding issues?
@ -129,9 +139,10 @@ class BlockLoader(object):
        # if starting with . or /, can only be a file path..
        file_only = url.startswith(('/', '.'))
        # convert to filename
        if url.startswith('file://'):
            url = url[len('file://'):]
            file_only = True
            url = urllib.url2pathname(url[len('file://'):])
        try:
            # first, try as file
--- a/pywb/utils/test/test_binsearch.py
+++ b/pywb/utils/test/test_binsearch.py
@ -66,12 +66,12 @@ from pywb import get_test_dir
 test_cdx_dir = get_test_dir() + 'cdx/'
 def print_binsearch_results(key, iter_func):
-    with open(test_cdx_dir + 'iana.cdx') as cdx:
+    with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
        for line in iter_func(cdx, key):
            print line
 def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
-    with open(test_cdx_dir + 'iana.cdx') as cdx:
+    with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
        for line in iter_func(cdx, key, end_key, prev_size=prev_size):
            print line
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@ -25,7 +25,7 @@ True
 100
 # no length specified, read full amount requested
->>> len(BlockLoader().load('file://' + test_cdx_dir + 'example.cdx', 0, -1).read(400))
+>>> len(BlockLoader().load(to_file_url(test_cdx_dir + 'example.cdx'), 0, -1).read(400))
 400
 # HMAC Cookie Maker
@ -56,14 +56,41 @@ True
 >>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
 >>> extract_client_cookie({}, 'y')
 # extract_post_query tests
 # correct POST data
 >>> post_data = 'foo=bar&dir=%2Fbaz'
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
 'foo=bar&dir=/baz'
 # unsupported method
 >>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
 # unsupported type
 >>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
 # invalid length
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
 # length too short
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
 'foo=bar&dir=%2'
 # length too long
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
 'foo=bar&dir=/baz'
 """
 #=================================================================
 import re
 import os
 from io import BytesIO
-from pywb.utils.loaders import BlockLoader, HMACCookieMaker
+from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
-from pywb.utils.loaders import LimitReader, extract_client_cookie
+from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
 from pywb import get_test_dir
@ -82,7 +109,6 @@ def seek_read_full(seekable_reader, offset):
    return seekable_reader.readline()
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/utils/wbexception.py
+++ b/pywb/utils/wbexception.py
@ -6,8 +6,9 @@ class WbException(Exception):
        Exception.__init__(self, msg)
        self.url = url
-    def status(self):
+# Default Error Code
-        return '500 Internal Server Error'
+#    def status(self):
 #        return '500 Internal Server Error'
 #=================================================================
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -115,8 +115,8 @@ def write_multi_cdx_index(output, inputs, **options):
            outpath = cdx_filename(filename)
            outpath = os.path.join(output, outpath)
-            with open(outpath, 'w') as outfile:
+            with open(outpath, 'wb') as outfile:
-                with open(fullpath, 'r') as infile:
+                with open(fullpath, 'rb') as infile:
                    write_cdx_index(outfile, infile, filename, **options)
    # write to one cdx file
@ -124,7 +124,7 @@ def write_multi_cdx_index(output, inputs, **options):
        if output == '-':
            outfile = sys.stdout
        else:
-            outfile = open(output, 'w')
+            outfile = open(output, 'wb')
        if options.get('sort'):
            writer_cls = SortedCDXWriter
@ -133,7 +133,7 @@ def write_multi_cdx_index(output, inputs, **options):
        with writer_cls(outfile, options.get('cdx09')) as writer:
            for fullpath, filename in iter_file_or_dir(inputs):
-                with open(fullpath, 'r') as infile:
+                with open(fullpath, 'rb') as infile:
                    entry_iter = create_index_iter(infile, **options)
                    for entry in entry_iter:
--- a/pywb/warc/pathresolvers.py
+++ b/pywb/warc/pathresolvers.py
@ -3,6 +3,7 @@ import redis
 from pywb.utils.binsearch import iter_exact
 import urlparse
 import urllib
 import os
 import logging
@ -56,7 +57,7 @@ class RedisResolver:
 class PathIndexResolver:
    def __init__(self, pathindex_file):
        self.pathindex_file = pathindex_file
-        self.reader = open(pathindex_file)
+        self.reader = open(pathindex_file, 'rb')
    def __call__(self, filename):
        result = iter_exact(self.reader, filename, '\t')
@ -92,6 +93,7 @@ def make_best_resolver(param):
    if url_parts.scheme == 'file':
        path = url_parts.path
        path = urllib.url2pathname(path)
    if os.path.isfile(path):
        logging.debug('Adding Path Index: ' + path)
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@ -160,7 +160,7 @@ TEST_CDX_DIR = get_test_dir() + 'cdx/'
 TEST_WARC_DIR = get_test_dir() + 'warcs/'
 def read_fully(cdx):
-    with open(TEST_CDX_DIR + cdx) as fh:
+    with open(TEST_CDX_DIR + cdx, 'rb') as fh:
        curr = BytesIO()
        while True:
            b = fh.read()
@ -172,7 +172,7 @@ def read_fully(cdx):
 def cdx_index(warc, **options):
    buff = BytesIO()
-    with open(TEST_WARC_DIR + warc) as fh:
+    with open(TEST_WARC_DIR + warc, 'rb') as fh:
        write_cdx_index(buff, fh,  warc, **options)
    return buff.getvalue()
@ -213,7 +213,7 @@ def cli_lines_with_dir(input_):
        print filename
-        with open(os.path.join(tmp_dir, filename), 'r') as fh:
+        with open(os.path.join(tmp_dir, filename), 'rb') as fh:
            lines = fh.read(8192).rstrip().split('\n')
    finally:
--- a/pywb/warc/test/test_pathresolvers.py
+++ b/pywb/warc/test/test_pathresolvers.py
@ -33,13 +33,13 @@ PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
 RedisResolver('redis://myhost.example.com:1234/1')
 # a file
->>> r = make_best_resolver('file://' + os.path.realpath(__file__))
+>>> r = make_best_resolver(to_file_url(os.path.realpath(__file__)))
 >>> r.__class__.__name__
 'PathIndexResolver'
 # a dir
 >>> path = os.path.realpath(__file__)
->>> r = make_best_resolver('file://' + os.path.dirname(path))
+>>> r = make_best_resolver(to_file_url(os.path.dirname(path)))
 >>> r.__class__.__name__
 'PrefixResolver'
@ -54,8 +54,9 @@ RedisResolver('redis://myhost.example.com:1234/1')
 from pywb import get_test_dir
 from pywb.warc.pathresolvers import PrefixResolver, PathIndexResolver, RedisResolver
 from pywb.warc.pathresolvers import make_best_resolver, make_best_resolvers
-import os
+from pywb.utils.loaders import to_file_url
 import os
 from fakeredis import FakeStrictRedis
 from mock import patch
@ -68,7 +69,6 @@ def init_redis_resolver():
 def hset_path(filename, path):
    redis_resolver.redis.hset(redis_resolver.key_prefix + filename, 'path', path)
 redis_resolver = init_redis_resolver()
 #=================================================================
--- a/pywb/webapp/rangecache.py
+++ b/pywb/webapp/rangecache.py
@ -65,7 +65,7 @@ class RangeCache(object):
            maxlen = min(maxlen, end - start + 1)
        def read_range():
-            with open(spec['name']) as fh:
+            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                fh = LimitReader.wrap_stream(fh, maxlen)
                while True:
--- a/pywb/webapp/test/test_view_filters.py
+++ b/pywb/webapp/test/test_view_filters.py
@ -0,0 +1,20 @@
 """
 >>> format_ts('20141226101000')
 'Fri, Dec 26 2014 10:10:00'
 >>> format_ts('20141226101000', '%s')
 1419588600
 >>> is_wb_handler(DebugEchoHandler())
 False
 """
 from pywb.webapp.views import format_ts, is_wb_handler
 from pywb.webapp.handlers import DebugEchoHandler
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/webapp/views.py
+++ b/pywb/webapp/views.py
@ -1,4 +1,4 @@
-from pywb.utils.timeutils import timestamp_to_datetime
+from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import make_timemap, LINK_FORMAT
@ -22,11 +22,7 @@ class template_filter(object):
    Otherwise, the func name is the filter name
    """
    def __init__(self, param=None):
-        if hasattr(param, '__call__'):
+        self.name = param
            self.name = None
            self.__call__(param)
        else:
            self.name = param
    def __call__(self, func):
        name = self.name
@ -39,10 +35,13 @@ class template_filter(object):
 #=================================================================
 # Filters
-@template_filter
+@template_filter()
 def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
-    value = timestamp_to_datetime(value)
+    if format_ == '%s':
-    return value.strftime(format_)
+        return timestamp_to_sec(value)
    else:
        value = timestamp_to_datetime(value)
        return value.strftime(format_)
@template_filter('urlsplit')
@ -51,17 +50,11 @@ def get_urlsplit(url):
    return split
@template_filter()
 def request_hostname(env):
    return env.get('HTTP_HOST', 'localhost')
@template_filter()
 def is_wb_handler(obj):
    if not hasattr(obj, 'handler'):
        return False
    #return isinstance(obj.handler, WBHandler)
    return obj.handler.__class__.__name__ == "WBHandler"
--- a/setup.py
+++ b/setup.py
@ -34,7 +34,7 @@ class PyTest(TestCommand):
 setup(
    name='pywb',
-    version='0.7.2',
+    version='0.7.5',
    url='https://github.com/ikreymer/pywb',
    author='Ilya Kreymer',
    author_email='ikreymer@gmail.com',
@ -58,10 +58,10 @@ setup(
        'pywb': ['static/flowplayer/*', 'static/*.*', 'ui/*', '*.yaml'],
        },
    data_files=[
-        ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
+        ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
-        ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
+        ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
-        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
+        ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
-        ('sample_archive/text_content/',
+        ('sample_archive/text_content',
            glob.glob('sample_archive/text_content/*')),
        ],
    install_requires=[
@ -90,7 +90,7 @@ setup(
        live-rewrite-server = pywb.apps.live_rewrite_server:main
        proxy-cert-auth = pywb.framework.certauth:main
        """,
-    zip_safe=False,
+    zip_safe=True,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Web Environment',