pywb/pywb/rewrite/rewrite_content.py

#import chardet
import pkgutil
import webencodings
import yaml
import re
#from chardet.universaldetector import UniversalDetector
from io import BytesIO
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
from pywb.utils.loaders import to_native_str
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter


#=================================================================
class RewriteContent(object):
    HEAD_REGEX = re.compile(rb'<\s*head\b[^>]*[>]+', re.I)
    TAG_REGEX = re.compile(rb'^\s*<')
    CHARSET_REGEX = re.compile(rb'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
    BUFF_SIZE = 16384

def __init__(self, ds_rules_file=None, is_framed_replay=False):
self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
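        # framed replay uses the 'mp_' ("main page") modifier as the
        # default; top-frame/non-framed mode leaves the modifier empty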
if is_framed_replay == 'inverse':
self.defmod = 'mp_'
else:
            self.defmod = ''

def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
if (status_headers.remove_header('transfer-encoding')):
stream = ChunkedDataReader(stream)
        return (status_headers, stream)

def _rewrite_headers(self, urlrewriter, rule, status_headers, stream,
urlkey='', cookie_rewriter=None):
header_rewriter_class = rule.rewriters['header']
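        # derive a cookie rewriter from the url rewriter unless the
        # caller supplied one explicitly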
if urlrewriter and not cookie_rewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)
rewritten_headers = (header_rewriter_class().
rewrite(status_headers,
urlrewriter,
cookie_rewriter))
# note: since chunk encoding may/may not be valid,
# the approach taken here is to *always* attempt
# to dechunk if 'transfer-encoding: chunked' is present
#
# an alternative may be to serve chunked unless
# content rewriting is needed
        # TODO: possibly revisit this approach
if (rewritten_headers.
contains_removed_header('transfer-encoding', 'chunked')):
stream = ChunkedDataReader(stream)
        return (rewritten_headers, stream)

def _check_encoding(self, rewritten_headers, stream, enc):
matched = False
if (rewritten_headers.
contains_removed_header('content-encoding', enc)):
#optimize: if already a ChunkedDataReader, add the encoding
if isinstance(stream, ChunkedDataReader):
stream.set_decomp(enc)
else:
stream = DecompressingBufferedReader(stream, decomp_type=enc)
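            # the decompressed body length differs, so the original
            # content-length header is no longer valid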
rewritten_headers.status_headers.remove_header('content-length')
matched = True
        return matched, stream

def rewrite_content(self, urlrewriter, status_headers, stream,
head_insert_func=None, urlkey='',
cdx=None, cookie_rewriter=None, env=None):
wb_url = urlrewriter.wburl
if (wb_url.is_identity or
(not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(status_headers,
stream)
return (status_headers, self.stream_to_gen(stream), False)
if urlrewriter and cdx and cdx.get('is_live'):
urlrewriter.rewrite_opts['is_live'] = True
rule = self.ruleset.get_first_match(urlkey)
(rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
rule,
status_headers,
stream,
urlkey,
cookie_rewriter)
res = self.handle_custom_rewrite(rewritten_headers,
stream,
urlrewriter,
wb_url.mod,
env)
if res:
return res
# Handle text content rewriting
# ====================================================================
# special case -- need to ungzip the body
status_headers = rewritten_headers.status_headers
text_type = rewritten_headers.text_type
        # if a known js/css modifier is specified, resolve the text type
        # accordingly, overriding the default text_type
mod = wb_url.mod
stream_raw = False
encoding = None
first_buff = b''
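        # try each supported content-encoding decompressor (e.g. gzip,
        # deflate) against the removed content-encoding header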
for decomp_type in BufferedReader.get_supported_decompressors():
matched, stream = self._check_encoding(rewritten_headers,
stream,
decomp_type)
if matched:
break
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
text_type,
stream)
elif mod == 'cs_':
text_type, stream = self._resolve_text_type('css',
text_type,
stream)
        # for proxy mode: use the special js_proxy rewriter, which may be
        # the no-op (none) rewriter plus custom rules (if any)
if text_type == 'js' and not urlrewriter.prefix:
rewriter_class = rule.rewriters['js_proxy']
else:
rewriter_class = rule.rewriters[text_type]
        # for html, need to perform the head insert and supply the js,
        # css, and xml rewriters
if text_type == 'html':
head_insert_str = ''
charset = rewritten_headers.charset
            # if no charset set, attempt to extract it from the first 1024 bytes
if not rewritten_headers.charset:
first_buff = stream.read(1024)
charset = self._extract_html_charset(first_buff,
status_headers)
if head_insert_func and not wb_url.is_url_rewrite_only:
head_insert_orig = head_insert_func(rule, cdx)
if charset:
try:
head_insert_str = webencodings.encode(head_insert_orig, charset)
                    except Exception:
pass
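                # fall back to utf-8 if no charset or the encode failed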
if not head_insert_str:
charset = 'utf-8'
head_insert_str = head_insert_orig.encode(charset)
head_insert_buf = head_insert_str
#head_insert_str = to_native_str(head_insert_str)
head_insert_str = head_insert_str.decode('iso-8859-1')
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_buf,
stream,
first_buff)
content_len = status_headers.get_header('Content-Length')
try:
content_len = int(content_len)
except Exception:
content_len = None
if content_len is not None and content_len >= 0:
content_len = str(content_len + len(head_insert_str))
status_headers.replace_header('Content-Length',
content_len)
return (status_headers, gen, False)
# if proxy, use js_proxy rewriter
if not urlrewriter.prefix:
js_rewriter_class = rule.rewriters['js_proxy']
else:
js_rewriter_class = rule.rewriters['js']
css_rewriter_class = rule.rewriters['css']
if wb_url.is_url_rewrite_only:
js_rewriter_class = JSNoneRewriter
rewriter = rewriter_class(urlrewriter,
js_rewriter_class=js_rewriter_class,
css_rewriter_class=css_rewriter_class,
head_insert=head_insert_str,
url=wb_url.url,
defmod=self.defmod,
parse_comments=rule.parse_comments)
else:
if wb_url.is_banner_only:
return (status_headers, self.stream_to_gen(stream), False)
            # url-rewrite-only mode: for js, rewrite only the links
if wb_url.is_url_rewrite_only and text_type == 'js':
#return (status_headers, self.stream_to_gen(stream), False)
rewriter_class = JSLinkOnlyRewriter
# apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter)
# align to line end for all non-html rewriting
align = (text_type != 'html')
# Create rewriting generator
gen = self.rewrite_text_stream_to_gen(stream,
rewrite_func=rewriter.rewrite,
final_read_func=rewriter.close,
first_buff=first_buff,
align_to_line=align)
        return (status_headers, gen, True)

def handle_custom_rewrite(self, rewritten_headers, stream,
urlrewriter, mod, env):
text_type = rewritten_headers.text_type
status_headers = rewritten_headers.status_headers
# use rewritten headers, but no further rewriting needed
if text_type is None:
return (status_headers, self.stream_to_gen(stream), False)
if ((text_type == 'html' and urlrewriter.rewrite_opts.get('is_ajax')) or
(text_type == 'plain' and not mod in ('js_', 'cs_'))):
rewritten_headers.readd_rewrite_removed()
            return (status_headers, self.stream_to_gen(stream), False)

@staticmethod
def _extract_html_charset(buff, status_headers):
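        # matches, e.g., <meta charset="utf-8"> as well as
        # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">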
charset = None
m = RewriteContent.CHARSET_REGEX.search(buff)
if m:
charset = m.group(1)
charset = to_native_str(charset)
# content_type = 'text/html; charset=' + charset
# status_headers.replace_header('content-type', content_type)
        return charset

@staticmethod
def _resolve_text_type(mod, text_type, stream):
if text_type == 'css' and mod == 'js':
return 'css', stream
# only attempt to resolve between html and other text types
if text_type != 'html':
return mod, stream
buff = stream.read(128)
wrapped_stream = BufferedReader(stream, starting_data=buff)
        # if the buffer starts with a tag, it is likely html
        # (e.g. b'<!doctype html>' or b'  <html>')
if RewriteContent.TAG_REGEX.match(buff):
mod = 'html'
        return mod, wrapped_stream

def _head_insert_only_gen(self, insert_str, stream, first_buff=b''):
buff = first_buff
max_len = 1024 - len(first_buff)
while max_len > 0:
curr = stream.read(max_len)
if not curr:
break
            max_len -= len(curr)
buff += curr
matcher = self.HEAD_REGEX.search(buff)
if matcher:
yield buff[:matcher.end()]
yield insert_str
yield buff[matcher.end():]
else:
yield insert_str
yield buff
for buff in self.stream_to_gen(stream):
            yield buff

@staticmethod
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
buff = buff.decode(encoding)
except UnicodeDecodeError as e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
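            # (a utf-8 code point is at most 4 bytes, so at most 3 extra
            # bytes are needed to complete a split sequence)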
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
        return buff

@staticmethod
def stream_to_gen(stream):
"""
        Convert stream to an iterator, reading BUFF_SIZE bytes at a time
"""
try:
while True:
buff = stream.read(RewriteContent.BUFF_SIZE)
yield buff
if not buff:
break
finally:
            stream.close()

@staticmethod
def rewrite_text_stream_to_gen(stream, rewrite_func,
final_read_func, first_buff,
align_to_line):
"""
Convert stream to generator using applying rewriting func
to each portion of the stream.
Align to line boundaries if needed.
"""
try:
has_closed = hasattr(stream, 'closed')
buff = first_buff
while True:
if buff:
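                    # decode/encode via iso-8859-1: a lossless, byte-preserving
                    # round-trip that lets the str-based rewriters operate on
                    # arbitrary bytes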
buff = rewrite_func(buff.decode('iso-8859-1'))
yield buff.encode('iso-8859-1')
buff = stream.read(RewriteContent.BUFF_SIZE)
# on 2.6, readline() (but not read()) throws an exception
# if stream already closed, so check stream.closed if present
if (buff and align_to_line and
(not has_closed or not stream.closed)):
buff += stream.readline()
if not buff:
break
# For adding a tail/handling final buffer
buff = final_read_func()
if buff:
yield buff.encode('iso-8859-1')
finally:
stream.close()
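

#=================================================================
# Illustrative usage sketch (not part of the module): a minimal example
# of driving RewriteContent against an in-memory response. It assumes
# UrlRewriter from pywb.rewrite.url_rewriter with a (wburl, prefix)
# constructor; the url, prefix, and body below are made up.
#
#   from io import BytesIO
#   from pywb.rewrite.url_rewriter import UrlRewriter
#
#   urlrewriter = UrlRewriter('20131226101010/http://example.com/', '/web/')
#   status_headers = StatusAndHeaders('200 OK',
#                                     [('Content-Type', 'text/html')])
#
#   content = RewriteContent()
#   status_headers, gen, is_rewritten = content.rewrite_content(
#       urlrewriter,
#       status_headers,
#       BytesIO(b'<html><a href="/page">link</a></html>'))
#
#   for chunk in gen:
#       pass  # stream the rewritten bytes to the client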