mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: content detection for specific case: if content type is html and mod type is css
or js, peek stream to determine actual type. Addresses #31 in part. Fix typo in wb_frame.js
This commit is contained in:
parent
8f57ce622d
commit
ffb702ce03
@ -13,13 +13,15 @@ from rewriterules import RewriteRules
|
|||||||
from pywb.utils.dsrules import RuleSet
|
from pywb.utils.dsrules import RuleSet
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriteContent:
|
class RewriteContent:
|
||||||
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
|
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
|
||||||
|
|
||||||
|
TAG_REGEX = re.compile(r'^\s*\<')
|
||||||
|
|
||||||
BUFF_SIZE = 16384
|
BUFF_SIZE = 16384
|
||||||
|
|
||||||
def __init__(self, ds_rules_file=None, is_framed_replay=False):
|
def __init__(self, ds_rules_file=None, is_framed_replay=False):
|
||||||
@ -106,11 +108,6 @@ class RewriteContent:
|
|||||||
# default text_type
|
# default text_type
|
||||||
mod = wb_url.mod
|
mod = wb_url.mod
|
||||||
|
|
||||||
if mod == 'js_':
|
|
||||||
text_type = 'js'
|
|
||||||
elif mod == 'cs_':
|
|
||||||
text_type = 'css'
|
|
||||||
|
|
||||||
stream_raw = False
|
stream_raw = False
|
||||||
encoding = None
|
encoding = None
|
||||||
first_buff = None
|
first_buff = None
|
||||||
@ -124,6 +121,15 @@ class RewriteContent:
|
|||||||
else:
|
else:
|
||||||
stream = DecompressingBufferedReader(stream)
|
stream = DecompressingBufferedReader(stream)
|
||||||
|
|
||||||
|
if mod == 'js_':
|
||||||
|
text_type, stream = self._resolve_text_type('js',
|
||||||
|
text_type,
|
||||||
|
stream)
|
||||||
|
elif mod == 'cs_':
|
||||||
|
text_type, stream = self._resolve_text_type('css',
|
||||||
|
text_type,
|
||||||
|
stream)
|
||||||
|
|
||||||
rewriter_class = rule.rewriters[text_type]
|
rewriter_class = rule.rewriters[text_type]
|
||||||
|
|
||||||
# for html, need to perform header insert, supply js, css, xml
|
# for html, need to perform header insert, supply js, css, xml
|
||||||
@ -173,6 +179,22 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (status_headers, gen, True)
|
return (status_headers, gen, True)
|
||||||
|
|
||||||
|
@staticmethod
def _resolve_text_type(mod, text_type, stream):
    """Decide between the requested text type and html by peeking the stream.

    mod       -- the text type implied by the url modifier (the callers
                 in this class pass 'js' or 'css')
    text_type -- the text type derived from the response content type
    stream    -- the content stream, readable via read(n)

    Returns a (text_type, stream) tuple. When text_type is not 'html'
    the modifier-implied type and the untouched stream are returned.
    Otherwise up to 128 bytes are read and the stream is re-wrapped in a
    BufferedReader seeded with those bytes, so that a later read() still
    yields the full content; if the peeked data begins (after optional
    whitespace) with '<', the content is treated as html after all.
    """
    # sniffing is only needed when the declared type is html but the
    # url modifier asked for something else
    if text_type == 'html':
        peeked = stream.read(128)

        # hand back a reader that replays the bytes consumed by the peek
        stream = BufferedReader(stream, starting_data=peeked)

        # a leading tag means the payload is most likely real html
        if RewriteContent.TAG_REGEX.match(peeked):
            mod = 'html'

    return mod, stream
|
||||||
|
|
||||||
def _head_insert_only_gen(self, insert_str, stream):
|
def _head_insert_only_gen(self, insert_str, stream):
|
||||||
max_len = 1024
|
max_len = 1024
|
||||||
buff = ''
|
buff = ''
|
||||||
|
@ -3,18 +3,28 @@
|
|||||||
|
|
||||||
ur"""
|
ur"""
|
||||||
# full seq
|
# full seq
|
||||||
>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
|
#>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
|
||||||
δοκ
|
δοκ
|
||||||
|
|
||||||
# read split bytes, read rest
|
# read split bytes, read rest
|
||||||
>>> b = BytesIO('\xbf\xce\xba')
|
#>>> b = BytesIO('\xbf\xce\xba')
|
||||||
>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
|
#>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
|
||||||
δοκ
|
δοκ
|
||||||
|
|
||||||
# invalid seq
|
# invalid seq
|
||||||
>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
|
#>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte
|
"UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte"
|
||||||
|
|
||||||
|
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' <html></html>'))
|
||||||
|
>>> print (text_type, stream.read())
|
||||||
|
('html', ' <html></html>')
|
||||||
|
|
||||||
|
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }'))
|
||||||
|
>>> print (text_type, stream.read())
|
||||||
|
('js', ' function() { return 0; }')
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
@ -45,8 +45,8 @@ function make_inner_url(url, ts)
|
|||||||
function push_state(url, timestamp, capture_str, is_live) {
|
function push_state(url, timestamp, capture_str, is_live) {
|
||||||
if (window.frames[0].WB_wombat_location) {
|
if (window.frames[0].WB_wombat_location) {
|
||||||
curr_href = window.frames[0].WB_wombat_location.href;
|
curr_href = window.frames[0].WB_wombat_location.href;
|
||||||
|
|
||||||
// If not current url, don't update
|
// If not current url, don't update
|
||||||
if (url != curr_href) {
|
if (url != curr_href) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -59,7 +59,7 @@ function push_state(url, timestamp, capture_str, is_live) {
|
|||||||
state.url = url;
|
state.url = url;
|
||||||
state.capture_str = capture_str;
|
state.capture_str = capture_str;
|
||||||
state.is_live = is_live;
|
state.is_live = is_live;
|
||||||
|
|
||||||
window.history.replaceState(state, "", state.inner_url);
|
window.history.replaceState(state, "", state.inner_url);
|
||||||
|
|
||||||
set_state(state);
|
set_state(state);
|
||||||
@ -67,13 +67,13 @@ function push_state(url, timestamp, capture_str, is_live) {
|
|||||||
|
|
||||||
function pop_state(state) {
|
function pop_state(state) {
|
||||||
set_state(state);
|
set_state(state);
|
||||||
|
|
||||||
window.frames[0].src = state.inner_url;
|
window.frames[0].src = state.inner_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
function extract_ts(url)
|
function extract_ts(url)
|
||||||
{
|
{
|
||||||
var result = value.match(TS_REGEX);
|
var result = url.match(TS_REGEX);
|
||||||
if (!result) {
|
if (!result) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
@ -112,7 +112,7 @@ function set_state(state) {
|
|||||||
|
|
||||||
window.onpopstate = function(event) {
|
window.onpopstate = function(event) {
|
||||||
var state = event.state;
|
var state = event.state;
|
||||||
|
|
||||||
if (state) {
|
if (state) {
|
||||||
pop_state(state);
|
pop_state(state);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user