1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

streaming rewriter improvements:

- add optional 'first_buff' defaulting to ''
- rename close() -> final_read()
- add rewrite_complete() for single-pass complete rewrite (including first_buff and final_read())
- rewrite_text_stream_to_gen() uses first_buff, uses member funcs directly
- remove unused close() from other rewriters, only needed for HTMLParser interface
This commit is contained in:
Ilya Kreymer 2017-07-18 21:06:48 -07:00
parent adab304f33
commit 35674c6de7
4 changed files with 22 additions and 29 deletions

View File

@ -201,49 +201,48 @@ class BufferedRewriter(object):
# ============================================================================ # ============================================================================
class StreamingRewriter(object): class StreamingRewriter(object):
def __init__(self, url_rewriter, align_to_line=True): def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
self.url_rewriter = url_rewriter self.url_rewriter = url_rewriter
self.align_to_line = align_to_line self.align_to_line = align_to_line
self.first_buff = first_buff
def __call__(self, rwinfo): def __call__(self, rwinfo):
gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream, return self.rewrite_text_stream_to_gen(rwinfo.content_stream)
rewrite_func=self.rewrite,
final_read_func=self.close,
align_to_line=self.align_to_line)
return gen
def rewrite(self, string): def rewrite(self, string):
return string return string
def close(self): def rewrite_complete(self, string):
return self.first_buff + self.rewrite(string) + self.final_read()
def final_read(self):
return '' return ''
def rewrite_text_stream_to_gen(cls, stream, def rewrite_text_stream_to_gen(self, stream):
rewrite_func,
final_read_func,
align_to_line):
""" """
Convert stream to generator using applying rewriting func Convert stream to generator using applying rewriting func
to each portion of the stream. to each portion of the stream.
Align to line boundaries if needed. Align to line boundaries if needed.
""" """
try: try:
buff = '' buff = self.first_buff
if buff:
yield buff.encode('iso-8859-1')
while True: while True:
buff = stream.read(BUFF_SIZE) buff = stream.read(BUFF_SIZE)
if not buff: if not buff:
break break
if align_to_line: if self.align_to_line:
buff += stream.readline() buff += stream.readline()
buff = rewrite_func(buff.decode('iso-8859-1')) buff = self.rewrite(buff.decode('iso-8859-1'))
yield buff.encode('iso-8859-1') yield buff.encode('iso-8859-1')
# For adding a tail/handling final buffer # For adding a tail/handling final buffer
buff = final_read_func() buff = self.final_read()
if buff: if buff:
yield buff.encode('iso-8859-1') yield buff.encode('iso-8859-1')

View File

@ -237,7 +237,7 @@ class HTMLRewriterMixin(StreamingRewriter):
def _rewrite_css(self, css_content): def _rewrite_css(self, css_content):
if css_content: if css_content:
return self.css_rewriter.rewrite(css_content) return self.css_rewriter.rewrite_complete(css_content)
else: else:
return '' return ''
@ -245,7 +245,7 @@ class HTMLRewriterMixin(StreamingRewriter):
if not script_content: if not script_content:
return '' return ''
content = self.js_rewriter.rewrite(script_content) content = self.js_rewriter.rewrite_complete(script_content)
if ensure_window: if ensure_window:
content = self.ADD_WINDOW.sub('window.\\1', content) content = self.ADD_WINDOW.sub('window.\\1', content)
@ -456,7 +456,7 @@ class HTMLRewriterMixin(StreamingRewriter):
return result return result
def close(self): def final_read(self):
self.out = self.AccumBuff() self.out = self.AccumBuff()
self._internal_close() self._internal_close()
@ -468,6 +468,9 @@ class HTMLRewriterMixin(StreamingRewriter):
return result return result
def close(self):
return self.final_read()
def _internal_close(self): # pragma: no cover def _internal_close(self): # pragma: no cover
raise NotImplementedError('Base method') raise NotImplementedError('Base method')
@ -562,8 +565,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
comment_rewriter = HTMLRewriter(self.url_rewriter, comment_rewriter = HTMLRewriter(self.url_rewriter,
defmod=self.defmod) defmod=self.defmod)
data = comment_rewriter.rewrite(data) data = comment_rewriter.rewrite_complete(data)
data += comment_rewriter.close()
self.out.write(data) self.out.write(data)
else: else:
self.parse_data(data) self.parse_data(data)

View File

@ -21,8 +21,3 @@ class JSONPRewriter(StreamingRewriter):
string = m_callback.group(1) + string[m_json.end(1):] string = m_callback.group(1) + string[m_json.end(1):]
return string return string
def close(self):
return ''

View File

@ -61,9 +61,6 @@ class RegexRewriter(StreamingRewriter):
def rewrite(self, string): def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string) return self.regex.sub(lambda x: self.replace(x), string)
def close(self):
return ''
def replace(self, m): def replace(self, m):
i = 0 i = 0
for _, op, count in self.rules: for _, op, count in self.rules: