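rewrite: content detection for a specific case (html content type with css or js mod type)

If the content type is html and the mod type is css or js, peek at the stream to determine the actual type. Addresses #31 in part. Also fixes a typo in wb_frame.js (`value` -> `url` in extract_ts).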
2025-03-15 00:03:28 +01:00 · 2014-12-26 13:08:35 -08:00 · 2014-12-26 13:08:35 -08:00 · ffb702ce03
commit ffb702ce03
parent 8f57ce622d
3 changed files with 49 additions and 17 deletions
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -13,13 +13,15 @@ from rewriterules import RewriteRules
 from pywb.utils.dsrules import RuleSet
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
-from pywb.utils.bufferedreaders import ChunkedDataReader
+from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader


 #=================================================================
 class RewriteContent:
    HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)

+    TAG_REGEX = re.compile(r'^\s*\<')
+
    BUFF_SIZE = 16384

    def __init__(self, ds_rules_file=None, is_framed_replay=False):
@@ -106,11 +108,6 @@ class RewriteContent:
        # default text_type
        mod = wb_url.mod

-        if mod == 'js_':
-            text_type = 'js'
-        elif mod == 'cs_':
-            text_type = 'css'
-
        stream_raw = False
        encoding = None
        first_buff = None
@@ -124,6 +121,15 @@ class RewriteContent:
            else:
                stream = DecompressingBufferedReader(stream)

+        if mod == 'js_':
+            text_type, stream = self._resolve_text_type('js',
+                                                        text_type,
+                                                        stream)
+        elif mod == 'cs_':
+            text_type, stream = self._resolve_text_type('css',
+                                                        text_type,
+                                                        stream)
+
        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
@@ -173,6 +179,22 @@ class RewriteContent:

        return (status_headers, gen, True)

+    @staticmethod
+    def _resolve_text_type(mod, text_type, stream):
+        # only attempt to resolve between html and other text types
+        if text_type != 'html':
+            return mod, stream
+
+        buff = stream.read(128)
+
+        wrapped_stream = BufferedReader(stream, starting_data=buff)
+
+        # check if starts with a tag, then likely html
+        if RewriteContent.TAG_REGEX.match(buff):
+            mod = 'html'
+
+        return mod, wrapped_stream
+
    def _head_insert_only_gen(self, insert_str, stream):
        max_len = 1024
        buff = ''
--- a/pywb/rewrite/test/test_rewrite_content.py
+++ b/pywb/rewrite/test/test_rewrite_content.py
@@ -3,18 +3,28 @@

 ur"""
 # full seq
->>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
+#>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
 δοκ

 # read split bytes, read rest
->>> b = BytesIO('\xbf\xce\xba')
->>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
+#>>> b = BytesIO('\xbf\xce\xba')
+#>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
 δοκ

 # invalid seq
->>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
+#>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
 Traceback (most recent call last):
-UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte
+"UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte"
+
+>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' <html></html>'))
+>>> print (text_type, stream.read())
+('html', ' <html></html>')
+
+>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }'))
+>>> print (text_type, stream.read())
+('js', ' function() { return 0; }')
+
+
 """

 from pywb.rewrite.rewrite_content import RewriteContent
--- a/pywb/static/wb_frame.js
+++ b/pywb/static/wb_frame.js
@@ -45,8 +45,8 @@ function make_inner_url(url, ts)
 function push_state(url, timestamp, capture_str, is_live) {
    if (window.frames[0].WB_wombat_location) {
        curr_href = window.frames[0].WB_wombat_location.href;
-        
-        // If not current url, don't update    
+
+        // If not current url, don't update
        if (url != curr_href) {
            return;
        }
@@ -59,7 +59,7 @@ function push_state(url, timestamp, capture_str, is_live) {
    state.url = url;
    state.capture_str = capture_str;
    state.is_live = is_live;
-    
+
    window.history.replaceState(state, "", state.inner_url);

    set_state(state);
@@ -67,13 +67,13 @@ function push_state(url, timestamp, capture_str, is_live) {

 function pop_state(state) {
    set_state(state);
-    
+
    window.frames[0].src = state.inner_url;
 }

 function extract_ts(url)
 {
-    var result = value.match(TS_REGEX);
+    var result = url.match(TS_REGEX);
    if (!result) {
        return "";
    }
@@ -112,7 +112,7 @@ function set_state(state) {

 window.onpopstate = function(event) {
    var state = event.state;
-    
+
    if (state) {
        pop_state(state);
    }