1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 15:09:54 +01:00

rewrite: content detection for a specific case: if the content type is html and the mod type is css

or js, peek at the stream to determine the actual type. Addresses #31 in part.
Fix typo in wb_frame.js
This commit is contained in:
Ilya Kreymer 2014-12-26 13:08:35 -08:00
parent 8f57ce622d
commit ffb702ce03
3 changed files with 49 additions and 17 deletions

View File

@ -13,13 +13,15 @@ from rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
#=================================================================
class RewriteContent:
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(r'^\s*\<')
BUFF_SIZE = 16384
def __init__(self, ds_rules_file=None, is_framed_replay=False):
@ -106,11 +108,6 @@ class RewriteContent:
# default text_type
mod = wb_url.mod
if mod == 'js_':
text_type = 'js'
elif mod == 'cs_':
text_type = 'css'
stream_raw = False
encoding = None
first_buff = None
@ -124,6 +121,15 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
text_type,
stream)
elif mod == 'cs_':
text_type, stream = self._resolve_text_type('css',
text_type,
stream)
rewriter_class = rule.rewriters[text_type]
# for html, need to perform header insert, supply js, css, xml
@ -173,6 +179,22 @@ class RewriteContent:
return (status_headers, gen, True)
@staticmethod
def _resolve_text_type(mod, text_type, stream):
    """Decide between the type requested by the url modifier and html.

    ``mod`` is the text type implied by the url modifier (the callers
    pass 'js' or 'css'), ``text_type`` is the type derived from the
    content type, and ``stream`` is the content stream.

    When ``text_type`` is anything other than 'html', the modifier type
    wins and the stream is returned untouched.  Otherwise the first
    128 bytes are peeked: if they begin with optional whitespace
    followed by '<', the content is treated as html, else the modifier
    type is kept.  In the peeking case the returned stream is a
    BufferedReader seeded with the peeked bytes, so the caller can
    still read the content from the beginning.

    Returns a ``(resolved_type, stream)`` tuple.
    """
    # nothing to disambiguate unless the declared type is html
    if text_type != 'html':
        return mod, stream

    peeked = stream.read(128)

    # wrap the stream so the peeked bytes are served again on read
    replay_stream = BufferedReader(stream, starting_data=peeked)

    # a leading tag is a strong hint that this really is html
    looks_like_html = RewriteContent.TAG_REGEX.match(peeked)
    resolved_type = 'html' if looks_like_html else mod

    return resolved_type, replay_stream
def _head_insert_only_gen(self, insert_str, stream):
max_len = 1024
buff = ''

View File

@ -3,18 +3,28 @@
ur"""
# full seq
>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
#>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
δοκ
# read split bytes, read rest
>>> b = BytesIO('\xbf\xce\xba')
>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
#>>> b = BytesIO('\xbf\xce\xba')
#>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
δοκ
# invalid seq
>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
#>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
Traceback (most recent call last):
UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte
"UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte"
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' <html></html>'))
>>> print (text_type, stream.read())
('html', ' <html></html>')
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }'))
>>> print (text_type, stream.read())
('js', ' function() { return 0; }')
"""
from pywb.rewrite.rewrite_content import RewriteContent

View File

@ -73,7 +73,7 @@ function pop_state(state) {
function extract_ts(url)
{
var result = value.match(TS_REGEX);
var result = url.match(TS_REGEX);
if (!result) {
return "";
}