Merge branch 'develop' into https-proxy

2025-03-15 00:03:28 +01:00 · 2014-08-04 22:01:16 -07:00 · 2014-08-04 22:01:16 -07:00 · a2d86fa495
commit a2d86fa495
parent 924f71a4cc e1e8f679b2
16 changed files with 148 additions and 111 deletions
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@ -179,13 +179,6 @@ class WbResponse(object):
        return WbResponse(StatusAndHeaders(status, redir_headers))

    def __call__(self, env, start_response):
-
-        # PERF
-        perfstats = env.get('X_PERF')
-        if perfstats:
-            self.status_headers.headers.append(('X-Archive-Perf-Stats',
-                                                str(perfstats)))
-
        start_response(self.status_headers.statusline,
                       self.status_headers.headers)

--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@ -101,12 +101,9 @@ class HTMLRewriterMixin(object):
        if not m:
            return meta_refresh

-        try:
-            meta_refresh = (meta_refresh[:m.start(1)] +
-                            self._rewrite_url(m.group(1)) +
-                            meta_refresh[m.end(1):])
-        except Exception:
-            pass
+        meta_refresh = (meta_refresh[:m.start(1)] +
+                        self._rewrite_url(m.group(1)) +
+                        meta_refresh[m.end(1):])

        return meta_refresh
    # ===========================
@ -136,7 +133,7 @@ class HTMLRewriterMixin(object):
                return value.lower() == attr_value.lower()
        return False

-    def _rewrite_tag_attrs(self, tag, tag_attrs, escape=False):
+    def _rewrite_tag_attrs(self, tag, tag_attrs):
        # special case: script or style parse context
        if ((tag in self.STATE_TAGS) and not self._wb_parse_context):
            self._wb_parse_context = tag
@ -197,7 +194,7 @@ class HTMLRewriterMixin(object):
                                         rebase_rewriter(attr_value))

            # write the attr!
-            self._write_attr(attr_name, attr_value, escape=escape)
+            self._write_attr(attr_name, attr_value)

        return True

@ -217,12 +214,10 @@ class HTMLRewriterMixin(object):

        return True

-    def _write_attr(self, name, value, escape=False):
+    def _write_attr(self, name, value):
        # parser doesn't differentiate between 'attr=""' and just 'attr'
        # 'attr=""' is more common, so use that form
        if value:
-            if escape:
-                value = cgi.escape(value, quote=True)
            self.out.write(' ' + name + '="' + value + '"')
        else:
            self.out.write(' ' + name + '=""')
@ -259,8 +254,8 @@ class HTMLRewriterMixin(object):

        return result

-    def _internal_close(self):
-        pass
+    def _internal_close(self):  # pragma: no cover
+        raise NotImplementedError('Base method')


 #=================================================================
@ -272,7 +267,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
    def feed(self, string):
        try:
            HTMLParser.feed(self, string)
-        except HTMLParseError:
+        except HTMLParseError:  # pragma: no cover
+            # only raised in 2.6
            self.out.write(string)

    def _internal_close(self):
@ -283,7 +279,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):

        try:
            HTMLParser.close(self)
-        except HTMLParseError:
+        except HTMLParseError:  # pragma: no cover
+            # only raised in 2.6
            pass

    # called to unescape attrs -- do not unescape!
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@ -58,10 +58,12 @@ class RewriteContent:

        return (rewritten_headers, stream)

-    def rewrite_content(self, wb_url, urlrewriter, headers, stream,
+    def rewrite_content(self, urlrewriter, headers, stream,
                        head_insert_func=None, urlkey='',
                        cdx=None):

+        wb_url = urlrewriter.wburl
+
        if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
@ -109,16 +111,6 @@ class RewriteContent:
            else:
                stream = DecompressingBufferedReader(stream)

-        #if self.decode_stream:
-        #    if rewritten_headers.charset:
-        #        encoding = rewritten_headers.charset
-        #    else:
-        #        (encoding, first_buff) = self._detect_charset(stream)
-
-            # if encoding not set or chardet thinks its ascii, use utf-8
-        #    if not encoding or encoding == 'ascii':
-        #        encoding = 'utf-8'
-
        rule = self.ruleset.get_first_match(urlkey)

        rewriter_class = rule.rewriters[text_type]
@ -149,8 +141,11 @@ class RewriteContent:
            rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
-        gen = self._rewriting_stream_gen(rewriter, encoding,
-                                         stream, first_buff)
+        gen =  self.stream_to_gen(stream,
+                                  rewrite_func=rewriter.rewrite,
+                                  final_read_func=rewriter.close,
+                                  first_buff=first_buff)
+

        return (status_headers, gen, True)

@ -179,32 +174,6 @@ class RewriteContent:
        for buff in self.stream_to_gen(stream):
            yield buff

-
-    # Create rewrite stream,  may even be chunked by front-end
-    def _rewriting_stream_gen(self, rewriter, encoding,
-                              stream, first_buff=None):
-
-        def do_rewrite(buff):
-            if encoding:
-                buff = self._decode_buff(buff, stream, encoding)
-            buff = rewriter.rewrite(buff)
-            if encoding:
-                buff = buff.encode(encoding)
-
-            return buff
-
-        def do_finish():
-            result = rewriter.close()
-            if encoding:
-                result = result.encode(encoding)
-
-            return result
-
-        return self.stream_to_gen(stream,
-                                  rewrite_func=do_rewrite,
-                                  final_read_func=do_finish,
-                                  first_buff=first_buff)
-
    @staticmethod
    def _decode_buff(buff, stream, encoding):  # pragma: no coverage
        try:
@ -223,26 +192,6 @@ class RewriteContent:

        return buff

-    def _detect_charset(self, stream):  # pragma: no coverage
-        full_buff = stream.read(8192)
-        io_buff = BytesIO(full_buff)
-
-        detector = UniversalDetector()
-
-        try:
-            buff = io_buff.read(256)
-            while buff:
-                detector.feed(buff)
-                if detector.done:
-                    break
-
-                buff = io_buff.read(256)
-        finally:
-            detector.close()
-
-        print "chardet result: ", str(detector.result)
-        return (detector.result['encoding'], full_buff)
-
    # Create a generator reading from a stream,
    # with optional rewriting and final read call
    @staticmethod
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@ -9,7 +9,7 @@ import logging

 from urlparse import urlsplit

-from pywb.utils.loaders import is_http, LimitReader
+from pywb.utils.loaders import is_http, LimitReader, BlockLoader
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.canonicalize import canonicalize
@ -30,7 +30,8 @@ class LiveRewriter(object):
            logging.debug('Live Rewrite Direct (no proxy)')

    def fetch_local_file(self, uri):
-        fh = open(uri)
+        #fh = open(uri)
+        fh = BlockLoader().load_file_or_resource(uri)

        content_type, _ = mimetypes.guess_type(uri)

@ -118,7 +119,7 @@ class LiveRewriter(object):

        return (status_headers, stream)

-    def fetch_request(self, wb_url, urlrewriter,
+    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
@ -127,15 +128,11 @@ class LiveRewriter(object):
                      follow_redirects=False,
                      proxies=None):

-        if isinstance(wb_url, str):
-            url = wb_url
-            wb_url = WbUrl(url)
-        else:
-            url = wb_url.url
-
        ts_err = url.split('///')

-        if len(ts_err) > 1:
+        # fixup for accidental erroneous rewrite which has ///
+        # (unless file:///)
+        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
@ -164,8 +161,7 @@ class LiveRewriter(object):
              }

        result = (self.rewriter.
-                  rewrite_content(wb_url,
-                                  urlrewriter,
+                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@ -99,6 +99,7 @@ ur"""
 >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
 <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

+# doctype
 >>> parse('<!doctype html PUBLIC "public">')
 <!doctype html PUBLIC "public">

--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@ -1,5 +1,6 @@
 from pywb.rewrite.rewrite_live import LiveRewriter
 from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl

 from pywb import get_test_dir

@ -9,6 +10,7 @@ from io import BytesIO
 # As such, the content may change and the test may break

 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
+bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')

 def head_insert_func(rule, cdx):
    if rule.js_rewrite_location == True:
@ -33,6 +35,51 @@ def test_local_1():
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff


+def test_local_no_head():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
+                                         urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert added
+    assert '<script src="/static/default/wombat.js"> </script>' in buff
+
+    # location rewritten
+    assert 'window.WB_wombat_location = "/other.html"' in buff
+
+    # link rewritten
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+def test_local_no_head_banner_only():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
+                                         bn_urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert added
+    assert '<script src="/static/default/wombat.js"> </script>' in buff
+
+    # location NOT rewritten
+    assert 'window.location = "/other.html"' in buff
+
+    # link NOT rewritten
+    assert '"another.html"' in buff
+
+def test_local_banner_only():
+    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+                                         bn_urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert added
+    assert '<head><script src="/static/default/wombat.js"> </script>' in buff
+
+    # location NOT rewritten
+    assert 'window.location = "/other.html"' in buff
+
+    # link NOT rewritten
+    assert '"another.html"' in buff
+
 def test_local_2_no_js_location_rewrite():
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
@ -76,8 +123,7 @@ def test_example_4_rewrite_err():
    assert status_headers.get_statuscode() == '200'

 def test_example_domain_specific_3():
-    urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
-    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True)
+    status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True)

    # comment out bootloader
    assert '/* Bootloader.configurePage' in buff
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@ -65,6 +65,9 @@
 >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
 'mailto:example@example.com'

+>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
+'file:///some/path/'
+
 >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
 '/abc/19960708im_/'

@ -73,10 +76,10 @@


 # HttpsUrlRewriter tests
->>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc')
+>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
 'http://example.com/abc'

->>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc')
+>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
 'http://example.com/abc'

 """
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@ -13,7 +13,8 @@ class UrlRewriter(object):
    instance and an optional full path prefix
    """

-    NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
+    NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
+                             'mailto:', 'about:', 'file:']

    PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']

@ -125,7 +126,7 @@ class UrlRewriter(object):


 #=================================================================
-class HttpsUrlRewriter(object):
+class HttpsUrlRewriter(UrlRewriter):
    """
    A url rewriter which urls that start with https:// to http://
    Other urls/input is unchanged.
@ -134,9 +135,6 @@ class HttpsUrlRewriter(object):
    HTTP = 'http://'
    HTTPS = 'https://'

-    def __init__(self, wburl, prefix, full_prefix=None):
-        pass
-
    def rewrite(self, url, mod=None):
        if url.startswith(self.HTTPS):
            result = self.HTTP + url[len(self.HTTPS):]
--- a/pywb/ui/query.html
+++ b/pywb/ui/query.html
@ -1,3 +1,28 @@
+<html>
+<head>
+<script>
+function ts_to_date(ts, is_gmt)
+{
+    if (ts.length < 14) {
+        return ts;
+    }
+    
+    var datestr = (ts.substring(0, 4) + "-" + 
+                  ts.substring(4, 6) + "-" +
+                  ts.substring(6, 8) + "T" +
+                  ts.substring(8, 10) + ":" +
+                  ts.substring(10, 12) + ":" +
+                  ts.substring(12, 14) + "-00:00");
+    
+    var date = new Date(datestr);
+    if (is_gmt) {
+        return date.toGMTString();
+    } else {
+        return date.toLocaleString();
+    }
+}
+</script>
+</head>
 <body>
  <h2>pywb Sample Calendar Results</h2>
  <b>{{ cdx_lines | length }}</b> captures of <b>{{ url }}</b>
@ -10,7 +35,9 @@
    </tr>
    {% for cdx in cdx_lines  %}
    <tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
-      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">{{ cdx['timestamp'] | format_ts}}</a></td>
+      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">
+      <script>document.write(ts_to_date("{{ cdx['timestamp']}}", true))</script>
+      </a></td>
      <td>{{ cdx['statuscode'] }}</td>
      <td>{{ cdx['original'] }}</td>
      <td>{{ cdx['filename'] }}</td>
@ -21,3 +48,4 @@
  <i><b>* Unique captures are bold.</b> Other captures are duplicates of a previous capture.</i>
  </p>
 </body>
+</html>
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@ -96,7 +96,7 @@ class BlockLoader(object):
        else:
            return self.load_file_or_resource(url, offset, length)

-    def load_file_or_resource(self, url, offset, length):
+    def load_file_or_resource(self, url, offset=0, length=-1):
        """
        Load a file-like reader from the local file system
        """
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@ -1,5 +1,5 @@
 #=================================================================
-"""
+r"""
 # LimitReader Tests
 >>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
 'abcdefghji'
@ -32,10 +32,14 @@ True
 >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
 'Example Domain'

-# fixed cookie
+# fixed cookie, range request
 >>> BlockLoader('some=value').load('http://example.com', 41, 14).read()
 'Example Domain'

+# range request
+>>> BlockLoader().load('http://example.com', 1262).read()
+'</html>\n'
+
 # test with extra id, ensure 4 parts of the A-B=C-D form are present
 >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
 4
--- a/pywb/webapp/live_rewrite_handler.py
+++ b/pywb/webapp/live_rewrite_handler.py
@ -38,6 +38,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
            return self.render_content(wbrequest)

        except Exception as exc:
+            import traceback
+            err_details = traceback.format_exc(exc)
+            print err_details
+
            url = wbrequest.wb_url.url
            msg = 'Could not load the url from the live web: ' + url
            raise LiveResourceException(msg=msg, url=url)
@ -53,8 +57,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

-        wb_url = wbrequest.wb_url
-        result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
+        result = self.rewriter.fetch_request(wbrequest.wb_url.url,
+                                             wbrequest.urlrewriter,
                                             head_insert_func=head_insert_func,
                                             req_headers=req_headers,
                                             env=wbrequest.env)
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@ -130,8 +130,7 @@ class ReplayView(object):
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
-                  rewrite_content(wbrequest.wb_url,
-                                  urlrewriter,
+                  rewrite_content(urlrewriter,
                                  headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
--- a/sample_archive/text_content/sample_no_head.html
+++ b/sample_archive/text_content/sample_no_head.html
@ -0,0 +1,8 @@
+<script>
+var some_val = false;
+if (some_val) {
+    window.location = "/other.html";
+}
+</script>
+Test Content
+<a href="another.html">Some Link</a>
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -301,6 +301,11 @@ class TestWb:
        assert resp.status_int == 200
        assert '"data": "^"' in resp.body

+    def test_post_invalid(self):
+        # not json
+        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+        assert resp.status_int == 404
+
    def test_post_redirect(self):
        # post handled without redirect (since 307 not allowed)
        resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')])
@ -308,7 +313,6 @@ class TestWb:
        assert '"foo": "bar"' in resp.body
        assert '"test": "abc"' in resp.body

-
    def test_excluded_content(self):
        resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
        assert resp.status_int == 403
--- a/tests/test_live_rewriter.py
+++ b/tests/test_live_rewriter.py
@ -17,6 +17,13 @@ class TestLiveRewriter:
        resp = self.testapp.get('/rewrite/mp_/http://facebook.com/')
        assert resp.status_int == 301

+    def test_live_rewrite_post(self):
+        resp = self.testapp.post('/rewrite/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 200
+        assert '"foo": "bar"' in resp.body
+        assert '"test": "abc"' in resp.body
+        assert resp.status_int == 200
+
    def test_live_rewrite_frame(self):
        resp = self.testapp.get('/rewrite/http://example.com/')
        assert resp.status_int == 200