diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 1317fa03..efd30d21 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -13,13 +13,15 @@ from rewriterules import RewriteRules from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.bufferedreaders import DecompressingBufferedReader -from pywb.utils.bufferedreaders import ChunkedDataReader +from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader #================================================================= class RewriteContent: HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I) + TAG_REGEX = re.compile(r'^\s*\<') + BUFF_SIZE = 16384 def __init__(self, ds_rules_file=None, is_framed_replay=False): @@ -106,11 +108,6 @@ class RewriteContent: # default text_type mod = wb_url.mod - if mod == 'js_': - text_type = 'js' - elif mod == 'cs_': - text_type = 'css' - stream_raw = False encoding = None first_buff = None @@ -124,6 +121,15 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) + if mod == 'js_': + text_type, stream = self._resolve_text_type('js', + text_type, + stream) + elif mod == 'cs_': + text_type, stream = self._resolve_text_type('css', + text_type, + stream) + rewriter_class = rule.rewriters[text_type] # for html, need to perform header insert, supply js, css, xml @@ -173,6 +179,22 @@ class RewriteContent: return (status_headers, gen, True) + @staticmethod + def _resolve_text_type(mod, text_type, stream): + # only attempt to resolve between html and other text types + if text_type != 'html': + return mod, stream + + buff = stream.read(128) + + wrapped_stream = BufferedReader(stream, starting_data=buff) + + # check if starts with a tag, then likely html + if RewriteContent.TAG_REGEX.match(buff): + mod = 'html' + + return mod, wrapped_stream + def _head_insert_only_gen(self, insert_str, stream): max_len = 1024 buff = '' diff --git a/pywb/rewrite/test/test_rewrite_content.py b/pywb/rewrite/test/test_rewrite_content.py index adcce5d8..fc5873dc 100644 --- a/pywb/rewrite/test/test_rewrite_content.py +++ b/pywb/rewrite/test/test_rewrite_content.py @@ -3,18 +3,28 @@ ur""" # full seq ->>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8') +#>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8') δοκ # read split bytes, read rest ->>> b = BytesIO('\xbf\xce\xba') ->>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8')) +#>>> b = BytesIO('\xbf\xce\xba') +#>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8')) δοκ # invalid seq ->>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8') +#>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8') Traceback (most recent call last): -UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte +"UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte" + +>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' ')) +>>> print (text_type, stream.read()) +('html', ' ') + +>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }')) +>>> print (text_type, stream.read()) +('js', ' function() { return 0; }') + + """ from pywb.rewrite.rewrite_content import RewriteContent diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 4dda3351..b50828b4 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -45,8 +45,8 @@ function make_inner_url(url, ts) function push_state(url, timestamp, capture_str, is_live) { if (window.frames[0].WB_wombat_location) { curr_href = window.frames[0].WB_wombat_location.href; - - // If not current url, don't update + + // If not current url, don't update if (url != curr_href) { return; } @@ -59,7 +59,7 @@ function push_state(url, timestamp, capture_str, is_live) { state.url = url; state.capture_str = capture_str; state.is_live = is_live; - + window.history.replaceState(state, "", state.inner_url); set_state(state); @@ -67,13 +67,13 @@ function push_state(url, timestamp, capture_str, is_live) { function pop_state(state) { set_state(state); - + window.frames[0].src = state.inner_url; } function extract_ts(url) { - var result = value.match(TS_REGEX); + var result = url.match(TS_REGEX); if (!result) { return ""; } @@ -112,7 +112,7 @@ function set_state(state) { window.onpopstate = function(event) { var state = event.state; - + if (state) { pop_state(state); }