Merge branch 'develop' 0.6.6 into video

2025-03-15 00:03:28 +01:00 · 2014-12-06 19:19:12 -08:00 · 2014-12-06 19:19:12 -08:00 · 7e36ad29e7
commit 7e36ad29e7
parent 7251c37c08 0495423e86
9 changed files with 84 additions and 19 deletions
--- a/CHANGES.rst
+++ b/CHANGES.rst
@ -1,7 +1,36 @@
 pywb 0.7.0 changelist
 ~~~~~~~~~~~~~~~~~~~~~
-Video Buffering Replay
+Video/streaming content replay and buffering improvements!
 pywb 0.6.6 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 * Beginning of new rewrite options, settable per collection and stored in UrlRewriter. Available options:
  - `rewrite_base` - set to False to disable rewriting `<base href="...">` tag
  - `rewrite_rel_canon` - set to False to disable rewriting `<link rel=canonical href="...">`
 * JS rewrite: Don't rewrite location if starting with '$'
 pywb 0.6.5 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 * fix static handling when content type cannot be guessed, default to 'application/octet-stream'
 * rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
 * rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com
 * cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
 * don't rewrite rel=canonical links for services which rely on these
 * cdx-indexer: Detect non-gzip chunk encoded .warc.gz/.arc.gz archive files and show a meaningful
  error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)
 pywb 0.6.4 changelist
--- a/README.rst
+++ b/README.rst
@ -58,10 +58,10 @@ running ``python setup.py install``:
  for all options.
-* ``cdx-server`` -- a CDX API only server which returns a responses about CDX captures in bulk. 
+* ``cdx-server`` -- a CDX API only server which returns responses about CDX captures in bulk.
  Includes most of the features of the `original cdx server implementation <https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server>`_,
  updated documentation coming soon.
-  
+
 * ``proxy-cert-auth`` -- a utility to support proxy mode. It can be used to generate a CA root certificate, or a per-host certificate with an existing root cert.
--- a/pywb/framework/archivalrouter.py
+++ b/pywb/framework/archivalrouter.py
@ -62,7 +62,8 @@ class ArchivalRouter(object):
                              use_abs_prefix=use_abs_prefix,
                              wburl_class=route.handler.get_wburl_type(),
                              urlrewriter_class=UrlRewriter,
-                              cookie_scope=route.cookie_scope)
+                              cookie_scope=route.cookie_scope,
                              rewrite_opts=route.rewrite_opts)
        # Allow for applying of additional filters
        route.apply_filters(wbrequest, matcher)
@ -101,6 +102,7 @@ class Route(object):
        # collection id from regex group (default 0)
        self.coll_group = coll_group
        self.cookie_scope = config.get('cookie_scope')
        self.rewrite_opts = config.get('rewrite_opts', {})
        self._custom_init(config)
    def is_handling(self, request_uri):
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@ -38,7 +38,8 @@ class WbRequest(object):
                 wburl_class=None,
                 urlrewriter_class=None,
                 is_proxy=False,
-                 cookie_scope=None):
+                 cookie_scope=None,
                 rewrite_opts={}):
        self.env = env
@ -77,7 +78,8 @@ class WbRequest(object):
                                                 host_prefix + rel_prefix,
                                                 rel_prefix,
                                                 env.get('SCRIPT_NAME', '/'),
-                                                 cookie_scope)
+                                                 cookie_scope,
                                                 rewrite_opts)
            self.urlrewriter.deprefix_url()
        else:
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@ -94,6 +94,9 @@ class HTMLRewriterMixin(object):
        self.rewrite_tags = self._init_rewrite_tags(defmod)
        # get opts from urlrewriter
        self.opts = url_rewriter.rewrite_opts
    # ===========================
    META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
                                    re.IGNORECASE | re.MULTILINE)
@ -186,9 +189,11 @@ class HTMLRewriterMixin(object):
            elif attr_name == 'crossorigin':
                attr_name = '_crossorigin'
-            # special case: link don't rewrite canonical
+            # special case: if rewrite_canon not set,
            # don't rewrite rel=canonical
            elif tag == 'link' and attr_name == 'href':
-                if not self.has_attr(tag_attrs, ('rel', 'canonical')):
+                if (self.opts.get('rewrite_rel_canon', True) or
                    not self.has_attr(tag_attrs, ('rel', 'canonical'))):
                    rw_mod = handler.get(attr_name)
                    attr_value = self._rewrite_url(attr_value, rw_mod)
@ -209,17 +214,21 @@ class HTMLRewriterMixin(object):
                    rw_mod = 'oe_'
                    attr_value = self._rewrite_url(attr_value, rw_mod)
            # special case: base tag
            elif (tag == 'base') and (attr_name == 'href') and attr_value:
                rw_mod = handler.get(attr_name)
                base_value = self._rewrite_url(attr_value, rw_mod)
                if self.opts.get('rewrite_base', True):
                    attr_value = base_value
                self.url_rewriter = (self.url_rewriter.
                                     rebase_rewriter(base_value))
            else:
                # rewrite url using tag handler
                rw_mod = handler.get(attr_name)
                if rw_mod is not None:
                    attr_value = self._rewrite_url(attr_value, rw_mod)
                # special case: base tag
                if (tag == 'base') and (attr_name == 'href') and attr_value:
                    self.url_rewriter = (self.url_rewriter.
                                         rebase_rewriter(attr_value))
            # write the attr!
            self._write_attr(attr_name, attr_value)
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@ -130,7 +130,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
    def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
        rules = rules + [
-             (r'(?<!/)\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
+             (r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
             (r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
             (r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
             (r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@ -20,13 +20,22 @@ ur"""
 #>>> parse('<input "selected"><img src></div>')
 #<input "selected"=""><img src=""></div>
-# Base Tests
+# Base Tests -- w/ rewrite (default)
 >>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>')
 <html><head><base href="/web/20131226101010/http://example.com/diff/path/file.html"/>
 >>> parse('<base href="static/"/><img src="image.gif"/>')
 <base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
 # Base Tests -- no rewrite
 >>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>', urlrewriter=no_base_canon_rewriter)
 <html><head><base href="http://example.com/diff/path/file.html"/>
 >>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
 <base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
 # HTML Entities
 >>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
 <a href="">&rsaquo; &nbsp; &#62; &#63</div>
@ -106,8 +115,12 @@ ur"""
 >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
 <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
-# don't rewrite rel=canonical
+# rel=canonical: rewrite (default)
 >>> parse('<link rel=canonical href="http://example.com/">')
 <link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
 # rel=canonical: no_rewrite
 >>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter)
 <link rel="canonical" href="http://example.com/">
 # doctype
@ -147,7 +160,12 @@ import pprint
 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
-def parse(data, head_insert = None):
+no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
                                     '/web/',
                                     rewrite_opts=dict(rewrite_rel_canon=False,
                                                       rewrite_base=False))
 def parse(data, head_insert=None, urlrewriter=urlrewriter):
    parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
    #data = data.decode('utf-8')
    result = parser.rewrite(data) + parser.close()
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@ -20,13 +20,14 @@ class UrlRewriter(object):
    REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
    def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
-                 root_path=None, cookie_scope=None):
+                 root_path=None, cookie_scope=None, rewrite_opts={}):
        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
        self.prefix = prefix
        self.full_prefix = full_prefix
        self.rel_prefix = rel_prefix if rel_prefix else prefix
        self.root_path = root_path if root_path else '/'
        self.cookie_scope = cookie_scope
        self.rewrite_opts = rewrite_opts
    def rewrite(self, url, mod=None):
        # if special protocol, no rewriting at all
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@ -193,7 +193,11 @@ class StaticHandler(BaseHandler):
            else:
                reader = iter(lambda: data.read(), '')
-            content_type, _ = mimetypes.guess_type(full_path)
+            content_type = 'application/octet-stream'
            guessed = mimetypes.guess_type(full_path)
            if guessed[0]:
                content_type = guessed[0]
            return WbResponse.text_stream(reader,
                                          content_type=content_type,