From 71e8ada57d3e1ef884424ae49067bf2b68094f10 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Mon, 4 Aug 2014 20:45:02 -0700
Subject: [PATCH] rewrite: add tests for banner-only mode and rewriting w/o a
 head, using local 'sample_no_head' file. query.html: use client-side
 rewriting for calendar dates. rewrite: remove unused decode stuff

---
 pywb/rewrite/rewrite_content.py               | 63 ++-----------------
 pywb/rewrite/rewrite_live.py                  |  9 ++-
 pywb/rewrite/test/test_html_rewriter.py       |  1 +
 pywb/rewrite/test/test_rewrite_live.py        | 53 ++++++++++++++++
 pywb/rewrite/test/test_url_rewriter.py        |  3 +
 pywb/rewrite/url_rewriter.py                  |  3 +-
 pywb/ui/query.html                            | 30 ++++++++-
 pywb/utils/loaders.py                         |  2 +-
 .../text_content/sample_no_head.html          |  8 +++
 9 files changed, 108 insertions(+), 64 deletions(-)
 create mode 100644 sample_archive/text_content/sample_no_head.html

diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 3a635d4e..3cbcd362 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -105,16 +105,6 @@ class RewriteContent:
             else:
                 stream = DecompressingBufferedReader(stream)
 
-        #if self.decode_stream:
-        #    if rewritten_headers.charset:
-        #        encoding = rewritten_headers.charset
-        #    else:
-        #        (encoding, first_buff) = self._detect_charset(stream)
-
-            # if encoding not set or chardet thinks its ascii, use utf-8
-        #    if not encoding or encoding == 'ascii':
-        #        encoding = 'utf-8'
-
         rule = self.ruleset.get_first_match(urlkey)
 
         rewriter_class = rule.rewriters[text_type]
@@ -145,8 +135,11 @@ class RewriteContent:
             rewriter = rewriter_class(urlrewriter)
 
         # Create rewriting generator
-        gen = self._rewriting_stream_gen(rewriter, encoding,
-                                         stream, first_buff)
+        gen =  self.stream_to_gen(stream,
+                                  rewrite_func=rewriter.rewrite,
+                                  final_read_func=rewriter.close,
+                                  first_buff=first_buff)
+
 
         return (status_headers, gen, True)
 
@@ -175,32 +168,6 @@ class RewriteContent:
         for buff in self.stream_to_gen(stream):
             yield buff
 
-
-    # Create rewrite stream,  may even be chunked by front-end
-    def _rewriting_stream_gen(self, rewriter, encoding,
-                              stream, first_buff=None):
-
-        def do_rewrite(buff):
-            if encoding:
-                buff = self._decode_buff(buff, stream, encoding)
-            buff = rewriter.rewrite(buff)
-            if encoding:
-                buff = buff.encode(encoding)
-
-            return buff
-
-        def do_finish():
-            result = rewriter.close()
-            if encoding:
-                result = result.encode(encoding)
-
-            return result
-
-        return self.stream_to_gen(stream,
-                                  rewrite_func=do_rewrite,
-                                  final_read_func=do_finish,
-                                  first_buff=first_buff)
-
     @staticmethod
     def _decode_buff(buff, stream, encoding):  # pragma: no coverage
         try:
@@ -219,26 +186,6 @@ class RewriteContent:
 
         return buff
 
-    def _detect_charset(self, stream):  # pragma: no coverage
-        full_buff = stream.read(8192)
-        io_buff = BytesIO(full_buff)
-
-        detector = UniversalDetector()
-
-        try:
-            buff = io_buff.read(256)
-            while buff:
-                detector.feed(buff)
-                if detector.done:
-                    break
-
-                buff = io_buff.read(256)
-        finally:
-            detector.close()
-
-        print "chardet result: ", str(detector.result)
-        return (detector.result['encoding'], full_buff)
-
     # Create a generator reading from a stream,
     # with optional rewriting and final read call
     @staticmethod
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index cbd3f106..5d77ff52 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -9,7 +9,7 @@ import logging
 
 from urlparse import urlsplit
 
-from pywb.utils.loaders import is_http, LimitReader
+from pywb.utils.loaders import is_http, LimitReader, BlockLoader
 from pywb.utils.timeutils import datetime_to_timestamp
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.canonicalize import canonicalize
@@ -30,7 +30,8 @@ class LiveRewriter(object):
             logging.debug('Live Rewrite Direct (no proxy)')
 
     def fetch_local_file(self, uri):
-        fh = open(uri)
+        #fh = open(uri)
+        fh = BlockLoader().load_file_or_resource(uri)
 
         content_type, _ = mimetypes.guess_type(uri)
 
@@ -135,12 +136,14 @@ class LiveRewriter(object):
 
         ts_err = url.split('///')
 
-        if len(ts_err) > 1:
+        if len(ts_err) > 1 and ts_err[0] != 'file:':
             url = 'http://' + ts_err[1]
 
         if url.startswith('//'):
             url = 'http:' + url
 
+        logging.debug('URL %s', url)
+
         if is_http(url):
             (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                        follow_redirects,
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index ae9b24e2..9ea8edc0 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -99,6 +99,7 @@ ur"""
 >>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
 <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
 
+# doctype
 >>> parse('<!doctype html PUBLIC "public">')
 <!doctype html PUBLIC "public">
 
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index 24f76da1..af25762b 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -1,5 +1,6 @@
 from pywb.rewrite.rewrite_live import LiveRewriter
 from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl
 
 from pywb import get_test_dir
 
@@ -33,6 +34,58 @@ def test_local_1():
     assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
 
 
+def test_local_no_head():
+    wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html')
+    status_headers, buff = get_rewritten(wb_url,
+                                         urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert is still added although the sample has no <head> tag
+    assert '<script src="/static/default/wombat.js"> </script>' in buff
+
+    # window.location in the inline script becomes WB_wombat_location
+    assert 'window.WB_wombat_location = "/other.html"' in buff
+
+    # relative link is rewritten under the /pywb/<timestamp>/ prefix
+    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+def test_local_no_head_banner_only():
+    wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html')
+    wb_url.mod = 'bn_'  # banner-only modifier
+
+    status_headers, buff = get_rewritten(wb_url,
+                                         urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert is still added although the sample has no <head> tag
+    assert '<script src="/static/default/wombat.js"> </script>' in buff
+
+    # window.location in the inline script is left as-is (NOT rewritten)
+    assert 'window.location = "/other.html"' in buff
+
+    # relative link is left as-is (NOT rewritten)
+    assert '"another.html"' in buff
+
+def test_local_banner_only():
+    wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample.html')
+    wb_url.mod = 'bn_'  # banner-only modifier
+
+    status_headers, buff = get_rewritten(wb_url,
+                                         urlrewriter,
+                                         head_insert_func,
+                                         'com,example,test)/')
+
+    # wombat insert is added immediately after the opening <head> tag
+    assert '<head><script src="/static/default/wombat.js"> </script>' in buff
+
+    # window.location is left as-is (NOT rewritten)
+    assert 'window.location = "/other.html"' in buff
+
+    # relative link is left as-is (NOT rewritten)
+    assert '"another.html"' in buff
+
 def test_local_2_no_js_location_rewrite():
     status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                          urlrewriter,
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index a4173d3a..345c4faf 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -65,6 +65,9 @@
 >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
 'mailto:example@example.com'
 
+>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
+'file:///some/path/'
+
 >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
 '/abc/19960708im_/'
 
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index d5593a22..d9b42c1b 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -13,7 +13,8 @@ class UrlRewriter(object):
     instance and an optional full path prefix
     """
 
-    NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
+    NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
+                             'mailto:', 'about:', 'file:']
 
     PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
 
diff --git a/pywb/ui/query.html b/pywb/ui/query.html
index c78e1b49..2d1f5c86 100644
--- a/pywb/ui/query.html
+++ b/pywb/ui/query.html
@@ -1,3 +1,28 @@
+<html>
+<head>
+<script>
+function ts_to_date(ts, is_gmt)
+{
+    // Format a UTC timestamp (YYYYMMDDhhmmss); shorter input is returned as-is.
+    if (ts.length < 14) {
+        return ts;
+    }
+    // Build an ISO 8601 string with the "Z" (UTC) designator for Date parsing.
+    var datestr = (ts.substring(0, 4) + "-" +
+                  ts.substring(4, 6) + "-" +
+                  ts.substring(6, 8) + "T" +
+                  ts.substring(8, 10) + ":" +
+                  ts.substring(10, 12) + ":" +
+                  ts.substring(12, 14) + "Z");
+    var date = new Date(datestr);
+    // Unparseable input (e.g. non-digits) also falls back to the raw value.
+    if (isNaN(date.getTime())) {
+        return ts;
+    }
+    return is_gmt ? date.toUTCString() : date.toLocaleString();  // toGMTString() is deprecated
+}
+</script>
+</head>
 <body>
   <h2>pywb Sample Calendar Results</h2>
   <b>{{ cdx_lines | length }}</b> captures of <b>{{ url }}</b>
@@ -10,7 +35,9 @@
     </tr>
     {% for cdx in cdx_lines  %}
     <tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
-      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">{{ cdx['timestamp'] | format_ts}}</a></td>
+      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">
+      <script>document.write(ts_to_date("{{ cdx['timestamp']}}", true))</script>
+      </a></td>
       <td>{{ cdx['statuscode'] }}</td>
       <td>{{ cdx['original'] }}</td>
       <td>{{ cdx['filename'] }}</td>
@@ -21,3 +48,4 @@
   <i><b>* Unique captures are bold.</b> Other captures are duplicates of a previous capture.</i>
   </p>
 </body>
+</html>
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6b383493..107379a2 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -96,7 +96,7 @@ class BlockLoader(object):
         else:
             return self.load_file_or_resource(url, offset, length)
 
-    def load_file_or_resource(self, url, offset, length):
+    def load_file_or_resource(self, url, offset=0, length=-1):
         """
         Load a file-like reader from the local file system
         """
diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html
new file mode 100644
index 00000000..ed4bc4f3
--- /dev/null
+++ b/sample_archive/text_content/sample_no_head.html
@@ -0,0 +1,8 @@
+<script>
+var some_val = false;
+if (some_val) {
+    window.location = "/other.html";
+}
+</script>
+Test Content
+<a href="another.html">Some Link</a>