1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

streaming rewriter improvements:

- add optional 'first_buff' defaulting to ''
- rename close() -> final_read()
- add rewrite_complete() for single-pass complete rewrite (including first_buff and final_read())
- rewrite_text_stream_to_gen() uses first_buff, uses member funcs directly
- remove unused close() from other rewriters, only needed for HTMLParser interface
This commit is contained in:
Ilya Kreymer 2017-07-18 21:06:48 -07:00
parent adab304f33
commit 35674c6de7
4 changed files with 22 additions and 29 deletions

View File

@ -201,49 +201,48 @@ class BufferedRewriter(object):
# ============================================================================
class StreamingRewriter(object):
def __init__(self, url_rewriter, align_to_line=True):
def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
self.url_rewriter = url_rewriter
self.align_to_line = align_to_line
self.first_buff = first_buff
def __call__(self, rwinfo):
gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
rewrite_func=self.rewrite,
final_read_func=self.close,
align_to_line=self.align_to_line)
return gen
return self.rewrite_text_stream_to_gen(rwinfo.content_stream)
def rewrite(self, string):
return string
def close(self):
def rewrite_complete(self, string):
return self.first_buff + self.rewrite(string) + self.final_read()
def final_read(self):
return ''
def rewrite_text_stream_to_gen(cls, stream,
rewrite_func,
final_read_func,
align_to_line):
def rewrite_text_stream_to_gen(self, stream):
"""
Convert stream to generator using applying rewriting func
to each portion of the stream.
Align to line boundaries if needed.
"""
try:
buff = ''
buff = self.first_buff
if buff:
yield buff.encode('iso-8859-1')
while True:
buff = stream.read(BUFF_SIZE)
if not buff:
break
if align_to_line:
if self.align_to_line:
buff += stream.readline()
buff = rewrite_func(buff.decode('iso-8859-1'))
buff = self.rewrite(buff.decode('iso-8859-1'))
yield buff.encode('iso-8859-1')
# For adding a tail/handling final buffer
buff = final_read_func()
buff = self.final_read()
if buff:
yield buff.encode('iso-8859-1')

View File

@ -237,7 +237,7 @@ class HTMLRewriterMixin(StreamingRewriter):
def _rewrite_css(self, css_content):
if css_content:
return self.css_rewriter.rewrite(css_content)
return self.css_rewriter.rewrite_complete(css_content)
else:
return ''
@ -245,7 +245,7 @@ class HTMLRewriterMixin(StreamingRewriter):
if not script_content:
return ''
content = self.js_rewriter.rewrite(script_content)
content = self.js_rewriter.rewrite_complete(script_content)
if ensure_window:
content = self.ADD_WINDOW.sub('window.\\1', content)
@ -456,7 +456,7 @@ class HTMLRewriterMixin(StreamingRewriter):
return result
def close(self):
def final_read(self):
self.out = self.AccumBuff()
self._internal_close()
@ -468,6 +468,9 @@ class HTMLRewriterMixin(StreamingRewriter):
return result
def close(self):
return self.final_read()
def _internal_close(self): # pragma: no cover
raise NotImplementedError('Base method')
@ -562,8 +565,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
comment_rewriter = HTMLRewriter(self.url_rewriter,
defmod=self.defmod)
data = comment_rewriter.rewrite(data)
data += comment_rewriter.close()
data = comment_rewriter.rewrite_complete(data)
self.out.write(data)
else:
self.parse_data(data)

View File

@ -21,8 +21,3 @@ class JSONPRewriter(StreamingRewriter):
string = m_callback.group(1) + string[m_json.end(1):]
return string
def close(self):
return ''

View File

@ -61,9 +61,6 @@ class RegexRewriter(StreamingRewriter):
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules: