#import chardet
import pkgutil
import yaml
import re

from chardet.universaldetector import UniversalDetector
from io import BytesIO

from header_rewriter import RewrittenStatusAndHeaders

from rewriterules import RewriteRules

from pywb.utils.dsrules import RuleSet

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader


#=================================================================
class RewriteContent:
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
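    # matches an opening head tag, e.g. '<head>' or '<head profile="...">'
    # (case-insensitive); the \b word boundary keeps '<header>' from matching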

    def __init__(self, ds_rules_file=None, is_framed_replay=False):
        self.ruleset = RuleSet(RewriteRules, 'rewrite',
                               default_rule_config={},
                               ds_rules_file=ds_rules_file)

        if is_framed_replay:
            self.defmod = 'mp_'
        else:
            self.defmod = ''

    def sanitize_content(self, status_headers, stream):
        # remove transfer encoding chunked and wrap in a dechunking stream
        if (status_headers.remove_header('transfer-encoding')):
            stream = ChunkedDataReader(stream)

return (status_headers, stream)
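
    # For reference (standard HTTP/1.1 chunked framing, not pywb-specific):
    # a chunked body of '4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n' is read back
    # by ChunkedDataReader as 'Wikipedia'.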

    def _rewrite_headers(self, urlrewriter, rule, status_headers, stream, urlkey=''):
        header_rewriter_class = rule.rewriters['header']

        cookie_rewriter = None

        if urlrewriter:
            cookie_rewriter = urlrewriter.get_cookie_rewriter(rule)

        rewritten_headers = (header_rewriter_class().
                             rewrite(status_headers, urlrewriter, cookie_rewriter))

        # note: since chunk encoding may or may not be valid,
        # the approach taken here is to *always* attempt
        # to dechunk if 'transfer-encoding: chunked' is present
        #
        # an alternative may be to serve chunked unless
        # content rewriting is needed
        # todo: possibly revisit this approach

        if (rewritten_headers.
            contains_removed_header('transfer-encoding', 'chunked')):

            stream = ChunkedDataReader(stream)

return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream,
head_insert_func=None, urlkey='',
cdx=None):
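        """
        Rewrite a recorded response for replay.

        Returns a 3-tuple of (status_headers, content generator,
        is_rewritten); the flag is False when the body is passed
        through unmodified and True when a text rewriter is applied.
        """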

        wb_url = urlrewriter.wburl

        if (wb_url.is_identity or
           (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                            rule,
                                                            headers,
                                                            stream)

status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================
        # special case -- need to ungzip the body

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, rewrite the body as that
        # text type rather than the type detected from the headers
mod = wb_url.mod

        if mod == 'js_':
            text_type = 'js'
        elif mod == 'cs_':
text_type = 'css'
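
        # e.g. under the usual pywb url scheme, a request such as
        # '/coll/20140101000000js_/http://example.com/app.js' carries the
        # 'js_' modifier and forces the js rewriter even if the response
        # was not served with a javascript content-type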

        stream_raw = False
        encoding = None
        first_buff = None

        if (rewritten_headers.
            contains_removed_header('content-encoding', 'gzip')):

            #optimize: if already a ChunkedDataReader, add gzip
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

rewriter_class = rule.rewriters[text_type]

        # for html, need to perform the head insert and supply the js and
        # css rewriters for inline content
        if text_type == 'html':
head_insert_str = ''

            if head_insert_func:
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode('utf-8')

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str, stream)

content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except (ValueError, TypeError):
                    # missing or non-numeric Content-Length
                    content_len = None

                if content_len and content_len >= 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
        gen = self.stream_to_gen(stream,
                                 rewrite_func=rewriter.rewrite,
                                 final_read_func=rewriter.close,
                                 first_buff=first_buff)

return (status_headers, gen, True)
def _head_insert_only_gen(self, insert_str, stream):
        max_len = 1024
        buff = ''
        while max_len > 0:
            curr = stream.read(max_len)
            if not curr:
                break

            # count down by the number of bytes actually read
            max_len -= len(curr)
buff += curr

        matcher = self.HEAD_REGEX.search(buff)

if matcher:
            yield buff[:matcher.end()]
            yield insert_str
            yield buff[matcher.end():]
        else:
            yield insert_str
            yield buff

for buff in self.stream_to_gen(stream):
yield buff
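
    # Illustration: for a stream starting '<html><head><title>...', this
    # yields '<html><head>', then insert_str, then the rest of the stream
    # unchanged; if no head tag appears in the first ~1kb, insert_str is
    # yielded before any content.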

    @staticmethod
def _decode_buff(buff, stream, encoding): # pragma: no coverage
        try:
            buff = buff.decode(encoding)
        except UnicodeDecodeError:
            # the chunk may have split a multi-byte character --
            # add 1-3 bytes and retry
for i in range(3):
                buff += stream.read(1)
                try:
                    buff = buff.decode(encoding)
                    break
                except UnicodeDecodeError:
                    pass
            else:
                raise

return buff

    # Create a generator reading from a stream,
    # with optional rewriting and final read call
    @staticmethod
    def stream_to_gen(stream, rewrite_func=None,
                      final_read_func=None, first_buff=None):
        try:
if first_buff:
                buff = first_buff
            else:
buff = stream.read()
                if buff and (not hasattr(stream, 'closed') or
                             not stream.closed):
                    buff += stream.readline()

while buff:
                if rewrite_func:
                    buff = rewrite_func(buff)
                yield buff
buff = stream.read()
                if buff:
                    buff += stream.readline()

            # For adding a tail/handling final buffer
if final_read_func:
                buff = final_read_func()
                if buff:
yield buff

        finally:
stream.close()
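

# Minimal usage sketch (illustrative only, not part of pywb): stream_to_gen
# drains any file-like object, so it can be exercised with a plain BytesIO.
if __name__ == '__main__':
    demo = BytesIO(b'<html><head></head><body>demo</body></html>')
    for chunk in RewriteContent.stream_to_gen(demo):
        print(chunk)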