1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: content detection for a specific case: if the content type is html and the mod type is css or js, peek at the stream to determine the actual type. Addresses #31 in part.
Fix typo in wb_frame.js
This commit is contained in:
Ilya Kreymer 2014-12-26 13:08:35 -08:00
parent 8f57ce622d
commit ffb702ce03
3 changed files with 49 additions and 17 deletions

View File

@ -13,13 +13,15 @@ from rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
#================================================================= #=================================================================
class RewriteContent: class RewriteContent:
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I) HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(r'^\s*\<')
BUFF_SIZE = 16384 BUFF_SIZE = 16384
def __init__(self, ds_rules_file=None, is_framed_replay=False): def __init__(self, ds_rules_file=None, is_framed_replay=False):
@ -106,11 +108,6 @@ class RewriteContent:
# default text_type # default text_type
mod = wb_url.mod mod = wb_url.mod
if mod == 'js_':
text_type = 'js'
elif mod == 'cs_':
text_type = 'css'
stream_raw = False stream_raw = False
encoding = None encoding = None
first_buff = None first_buff = None
@ -124,6 +121,15 @@ class RewriteContent:
else: else:
stream = DecompressingBufferedReader(stream) stream = DecompressingBufferedReader(stream)
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
text_type,
stream)
elif mod == 'cs_':
text_type, stream = self._resolve_text_type('css',
text_type,
stream)
rewriter_class = rule.rewriters[text_type] rewriter_class = rule.rewriters[text_type]
# for html, need to perform header insert, supply js, css, xml # for html, need to perform header insert, supply js, css, xml
@ -173,6 +179,22 @@ class RewriteContent:
return (status_headers, gen, True) return (status_headers, gen, True)
@staticmethod
def _resolve_text_type(mod, text_type, stream):
    """Pick the text type to rewrite with when the url modifier and the
    detected content type may disagree.

    mod       -- text type requested by the url modifier (callers pass
                 'js' or 'css')
    text_type -- text type detected for the content
    stream    -- readable stream positioned at the start of the content

    Returns a (text_type, stream) tuple. The returned stream must be used
    in place of the one passed in, since the original may have been
    partially consumed while peeking.
    """
    if text_type == 'html':
        # Peek at the beginning of the content, then wrap the stream so
        # the peeked bytes are handed back to whoever reads it next.
        head = stream.read(128)
        stream = BufferedReader(stream, starting_data=head)

        # Content that opens with a tag (after optional whitespace) is
        # most likely real html, so keep treating it as html.
        if RewriteContent.TAG_REGEX.match(head):
            mod = 'html'

    # Any non-html content type is left alone: the modifier wins and the
    # stream is returned untouched.
    return mod, stream
def _head_insert_only_gen(self, insert_str, stream): def _head_insert_only_gen(self, insert_str, stream):
max_len = 1024 max_len = 1024
buff = '' buff = ''

View File

@ -3,18 +3,28 @@
ur""" ur"""
# full seq # full seq
>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8') #>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
δοκ δοκ
# read split bytes, read rest # read split bytes, read rest
>>> b = BytesIO('\xbf\xce\xba') #>>> b = BytesIO('\xbf\xce\xba')
>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8')) #>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
δοκ δοκ
# invalid seq # invalid seq
>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8') #>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
Traceback (most recent call last): Traceback (most recent call last):
UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte "UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte"
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' <html></html>'))
>>> print (text_type, stream.read())
('html', ' <html></html>')
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }'))
>>> print (text_type, stream.read())
('js', ' function() { return 0; }')
""" """
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent

View File

@ -45,8 +45,8 @@ function make_inner_url(url, ts)
function push_state(url, timestamp, capture_str, is_live) { function push_state(url, timestamp, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) { if (window.frames[0].WB_wombat_location) {
curr_href = window.frames[0].WB_wombat_location.href; curr_href = window.frames[0].WB_wombat_location.href;
// If not current url, don't update // If not current url, don't update
if (url != curr_href) { if (url != curr_href) {
return; return;
} }
@ -59,7 +59,7 @@ function push_state(url, timestamp, capture_str, is_live) {
state.url = url; state.url = url;
state.capture_str = capture_str; state.capture_str = capture_str;
state.is_live = is_live; state.is_live = is_live;
window.history.replaceState(state, "", state.inner_url); window.history.replaceState(state, "", state.inner_url);
set_state(state); set_state(state);
@ -67,13 +67,13 @@ function push_state(url, timestamp, capture_str, is_live) {
function pop_state(state) { function pop_state(state) {
set_state(state); set_state(state);
window.frames[0].src = state.inner_url; window.frames[0].src = state.inner_url;
} }
function extract_ts(url) function extract_ts(url)
{ {
var result = value.match(TS_REGEX); var result = url.match(TS_REGEX);
if (!result) { if (!result) {
return ""; return "";
} }
@ -112,7 +112,7 @@ function set_state(state) {
window.onpopstate = function(event) { window.onpopstate = function(event) {
var state = event.state; var state = event.state;
if (state) { if (state) {
pop_state(state); pop_state(state);
} }