From c1be7d4da560ed190c9d7a950b08e163ef20539c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 10 May 2017 19:05:55 -0700
Subject: [PATCH] rewrite system refactor:

- rewriter interface accepts a RewriteInfo instance
- add StreamingRewriter adapter: wraps the html and regex rewriters so that
  streaming text can be rewritten through the general rewriter interface
- add RewriteDASH, RewriteHLS as (non-streaming) rewriters; contents must be
  read into a buffer (for now)
- add RewriteAMF: experimental AMF rewriter
- general rewriting system in BaseContentRewriter, default rewriters
  configured in DefaultRewriter
- tests: disable banner-only test, as banner-only replay is not currently
  supported (for now)
---
 pywb/rewrite/content_rewriter.py | 324 +++++++++++++++++++++++++++++++
 pywb/rewrite/html_rewriter.py    |   6 +-
 pywb/rewrite/jsonp_rewriter.py   |   6 +-
 pywb/rewrite/regex_rewriters.py  |   4 +-
 pywb/rewrite/rewrite_amf.py      |  18 +-
 pywb/rewrite/rewrite_dash.py     |  77 ++------
 pywb/rewrite/rewrite_hls.py      |  44 +++++
 pywb/urlrewrite/rewriter.py      | 318 ++----------------------------
 pywb/urlrewrite/rewriterapp.py   |   4 +-
 tests/test_integration.py        |   3 +-
 10 files changed, 423 insertions(+), 381 deletions(-)
 create mode 100644 pywb/rewrite/content_rewriter.py
 create mode 100644 pywb/rewrite/rewrite_hls.py

diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py
new file mode 100644
index 00000000..e94b6f21
--- /dev/null
+++ b/pywb/rewrite/content_rewriter.py
@@ -0,0 +1,324 @@
+from io import BytesIO
+
+from contextlib import closing
+
+from warcio.bufferedreaders import BufferedReader
+from warcio.utils import to_native_str
+
+import re
+import webencodings
+
+from pywb.webagg.utils import StreamIter, BUFF_SIZE
+from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
+
+from pywb.utils.loaders import load_yaml_config
+
+
+# ============================================================================
+class BaseContentRewriter(object):
+    CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
+
+    def __init__(self, rules_file, replay_mod=''):
+        self.rules = []
+        self.load_rules(rules_file)
+        self.replay_mod = replay_mod
+        #for rw in self.known_rewriters:
+        #    self.all_rewriters[rw.name] = rw
+
+    def add_rewriter(self, rw):
+        self.all_rewriters[rw.name] = rw
+
+    def get_rewriter(self, url, text_type):
+        return self.all_rewriters.get(text_type)
+
+    def load_rules(self, filename):
+        config = load_yaml_config(filename)
+        for rule in config.get('rules'):
+            rule = self.parse_rewrite_rule(rule)
+            if rule:
+                self.rules.append(rule)
+
+    def parse_rewrite_rule(self, config):
+        rw_config = config.get('rewrite')
+        if not rw_config:
+            return
+
+        rule = rw_config
+        url_prefix = config.get('url_prefix')
+        if not isinstance(url_prefix, list):
+            url_prefix = [url_prefix]
+
+        rule['url_prefix'] = url_prefix
+
+        regexs = rule.get('js_regexs')
+        if regexs:
+            parse_rules_func = self.init_js_regex(regexs)
+            rule['js_regex_func'] = parse_rules_func
+
+        return rule
+
+    def get_rule(self, cdx):
+        urlkey = to_native_str(cdx['urlkey'])
+
+        for rule in self.rules:
+            if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])):
+                return rule
+
+        return {}
+
+    def get_rw_class(self, rule, text_type, rwinfo):
+        if text_type == 'js' and not rwinfo.is_url_rw():
+            text_type = 'js-proxy'
+
+        rw_type = rule.get(text_type, text_type)
+        rw_class = self.all_rewriters.get(rw_type)
+
+        return rw_type, rw_class
+
+    def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
+        rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)
+
+        if rw_type in ('js', 'js-proxy'):
+            extra_rules = []
+            if 'js_regex_func' in rule:
+                extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)
+
+            return rw_class(rwinfo.url_rewriter, extra_rules)
+
+        elif rw_type != 'html':
+            return rw_class(rwinfo.url_rewriter)
+
+        # HTML Rewriter
+        head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx)
+
+        js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
+        css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)
+
+        rw = rw_class(rwinfo.url_rewriter,
+                      js_rewriter=js_rewriter,
+                      css_rewriter=css_rewriter,
+                      head_insert=head_insert_str,
+                      url=cdx['url'],
+                      defmod=self.replay_mod,
+                      parse_comments=rule.get('parse_comments', False))
+
+        return rw
+
+    def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
+        head_insert_str = ''
+        charset = rwinfo.charset
+
+        # if no charset set, attempt to extract from first 1024 bytes
+        if not charset:
+            first_buff = rwinfo.read_and_keep(1024)
+            charset = self.extract_html_charset(first_buff)
+
+        if head_insert_func:
+            head_insert_orig = head_insert_func(rule, cdx)
+
+            if charset:
+                try:
+                    head_insert_str = webencodings.encode(head_insert_orig, charset)
+                except:
+                    pass
+
+            if not head_insert_str:
+                charset = 'utf-8'
+                head_insert_str = head_insert_orig.encode(charset)
+
+            head_insert_str = head_insert_str.decode('iso-8859-1')
+
+        return head_insert_str
+
+    def extract_html_charset(self, buff):
+        charset = None
+        m = self.CHARSET_REGEX.search(buff)
+        if m:
+            charset = m.group(1)
+            charset = to_native_str(charset)
+
+        return charset
+
+    def rewrite_headers(self, rwinfo):
+        if rwinfo.is_url_rw():
+            header_rw_name = 'header'
+        else:
+            header_rw_name = 'header-proxy'
+
+        header_rw_class = self.all_rewriters.get(header_rw_name)
+        rwinfo.rw_http_headers = header_rw_class(rwinfo)()
+
+    def __call__(self, record, url_rewriter, cookie_rewriter,
+                 head_insert_func=None,
+                 cdx=None):
+
+        rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
+
+        self.rewrite_headers(rwinfo)
+
+        content_rewriter = None
+        if rwinfo.is_content_rw():
+            rule = self.get_rule(cdx)
+            content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
+
+        if content_rewriter:
+            gen = content_rewriter(rwinfo)
+        else:
+            gen = StreamIter(rwinfo.content_stream)
+
+        return rwinfo.rw_http_headers, gen, (content_rewriter is not None)
+
+    def init_js_regex(self, regexs):
+        raise NotImplementedError()
+
+    def get_rewrite_types(self):
+        raise NotImplementedError()
+
+
+# ============================================================================
+class StreamingRewriter(object):
+    def __init__(self):
+        self.align_to_line = True
+
+    def __call__(self, rwinfo):
+        gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
+                                              rewrite_func=self.rewrite,
+                                              final_read_func=self.close,
+                                              align_to_line=self.align_to_line)
+
+        return gen
+
+    def rewrite(self, string):
+        return string
+
+    def close(self):
+        return ''
+
+    def rewrite_text_stream_to_gen(self, stream,
+                                   rewrite_func,
+                                   final_read_func,
+                                   align_to_line):
+        """
+        Convert a stream to a generator by applying the rewrite func
+        to each portion of the stream.
+        Align to line boundaries if needed.
+        """
+        try:
+            buff = ''
+
+            while True:
+                buff = stream.read(BUFF_SIZE)
+                if not buff:
+                    break
+
+                if align_to_line:
+                    buff += stream.readline()
+
+                buff = rewrite_func(buff.decode('iso-8859-1'))
+                yield buff.encode('iso-8859-1')
+
+            # For adding a tail/handling final buffer
+            buff = final_read_func()
+            if buff:
+                yield buff.encode('iso-8859-1')
+
+        finally:
+            stream.close()
+
+
+# ============================================================================
+class RewriteInfo(object):
+    TAG_REGEX = re.compile(b'^\s*\<')
+
+    def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
+        self.record = record
+
+        self.rw_http_headers = record.http_headers
+        self.content_stream = record.content_stream()
+
+        self.rewrite_types = rewrite_types
+
+        self.text_type = None
+        self.charset = None
+
+        self.url_rewriter = url_rewriter
+
+        if not cookie_rewriter:
+            cookie_rewriter = ExactPathCookieRewriter(url_rewriter)
+
+        self.cookie_rewriter = cookie_rewriter
+
+        self._fill_text_type_and_charset()
+        self._resolve_text_type()
+
+    def _fill_text_type_and_charset(self):
+        content_type = self.record.http_headers.get_header('Content-Type')
+        if not content_type:
+            return
+
+        parts = content_type.split(';', 1)
+        mime = parts[0]
+
+        self.text_type = self.rewrite_types.get(mime)
+        if not self.text_type:
+            return
+
+        if len(parts) == 2:
+            parts = parts[1].lower().split('charset=', 1)
+            if len(parts) == 2:
+                self.charset = parts[1].strip()
+
+    def _resolve_text_type(self):
+        mod = self.url_rewriter.wburl.mod
+
+        if self.text_type == 'css' and mod == 'js_':
+            self.text_type = 'css'
+
+        # only attempt to resolve between html and other text types
+        if self.text_type != 'html':
+            return
+
+        if mod != 'js_' and mod != 'cs_':
+            return
+
+        buff = self.read_and_keep(128)
+
+        # check if starts with a tag, then likely html
+        if self.TAG_REGEX.match(buff):
+            self.text_type = 'html'
+
+    def read_and_keep(self, size):
+        buff = self.content_stream.read(size)
+        self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
+        return buff
+
+    def is_content_rw(self):
+        if not self.url_rewriter.prefix:
+            return False
+
+        if self.url_rewriter.wburl.mod == 'id_':
+            return False
+
+        if self.text_type == 'html':
+            if self.url_rewriter.rewrite_opts.get('is_ajax'):
+                return False
+
+        elif self.text_type == 'plain':
+            if self.url_rewriter.wburl.mod not in ('js_', 'cs_'):
+                return False
+
+        elif not self.text_type:
+            return False
+
+        return True
+
+    def is_url_rw(self):
+        if not self.url_rewriter:
+            return False
+
+        if self.url_rewriter.wburl.mod == 'id_':
+            return False
+
+        return True
+
+
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 5651e474..54b0802b 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -11,6 +11,8 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
 
+from pywb.rewrite.content_rewriter import StreamingRewriter
+
 import six.moves.html_parser
 six.moves.html_parser.unescape = lambda x: x
 from six import text_type
@@ -441,7 +443,7 @@ class HTMLRewriterMixin(object):
 
 
 #=================================================================
-class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
+class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
     PARSETAG = re.compile('[<]')
 
     def __init__(self, *args, **kwargs):
@@ -451,6 +453,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
         HTMLParser.__init__(self)
         super(HTMLRewriter, self).__init__(*args, **kwargs)
+        # for StreamingRewriter
+        self.align_to_line = False
 
     def reset(self):
         HTMLParser.reset(self)
 
diff --git a/pywb/rewrite/jsonp_rewriter.py b/pywb/rewrite/jsonp_rewriter.py
index d0558edb..6f603566 100644
--- a/pywb/rewrite/jsonp_rewriter.py
+++ b/pywb/rewrite/jsonp_rewriter.py
@@ -1,10 +1,14 @@
 import re
+from pywb.rewrite.content_rewriter import StreamingRewriter
 
-class JSONPRewriter(object):
+
+# ============================================================================
+class JSONPRewriter(StreamingRewriter):
     JSONP = re.compile(r'^(\w+)\(\{')
     CALLBACK = re.compile(r'[?].*callback=([^&]+)')
 
     def __init__(self, urlrewriter):
+        super(JSONPRewriter, self).__init__()
         self.urlrewriter = urlrewriter
 
     def rewrite(self, string):
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
index da9ee355..9e4f0958 100644
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@@ -1,6 +1,7 @@
 import re
 
 from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.content_rewriter import StreamingRewriter
 
 
 #=================================================================
@@ -13,7 +14,7 @@ def load_function(string):
 
 
 #=================================================================
-class RegexRewriter(object):
+class RegexRewriter(StreamingRewriter):
     #@staticmethod
     #def comment_out(string):
     #    return '/*' + string + '*/'
@@ -43,6 +44,7 @@ class RegexRewriter(object):
     #DEFAULT_OP = add_prefix
 
     def __init__(self, rewriter, rules):
+        super(RegexRewriter, self).__init__()
         #rules = self.create_rules(http_prefix)
 
         # Build regexstr, concatenating regex list
diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py
index ddd4e078..075af986 100644
--- a/pywb/rewrite/rewrite_amf.py
+++ b/pywb/rewrite/rewrite_amf.py
@@ -3,16 +3,9 @@ from six.moves import zip
 
 
 # ============================================================================
-# Expiermental: not fully tested
-class RewriteAMFMixin(object): #pragma: no cover
-    def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
-        if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
-            stream = self.rewrite_amf(stream, env)
-
-        return (super(RewriteAMFMixin, self).
-                handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
-
-    def rewrite_amf(self, stream, env):
+# Experimental: not fully tested
+class RewriteAMF(object): #pragma: no cover
+    def __call__(self, rwinfo):
         try:
             from pyamf import remoting
 
@@ -26,9 +19,10 @@ class RewriteAMFMixin(object): #pragma: no cover
         iobuff.seek(0)
         res = remoting.decode(iobuff)
 
-        if env and env.get('pywb.inputdata'):
-            inputdata = env.get('pywb.inputdata')
+        # TODO: revisit this
+        inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata')
 
+        if inputdata:
             new_list = []
 
             for src, target in zip(inputdata.bodies, res.bodies):
 
diff --git a/pywb/rewrite/rewrite_dash.py b/pywb/rewrite/rewrite_dash.py
index c01a5eab..497a381b 100644
--- a/pywb/rewrite/rewrite_dash.py
+++ b/pywb/rewrite/rewrite_dash.py
@@ -1,39 +1,17 @@
-import xml.etree.ElementTree as ET
 from contextlib import closing
 from io import BytesIO, StringIO
 import json
 
-from pywb.webagg.utils import StreamIter
-import re
+import xml.etree.ElementTree as ET
 
-EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
+from pywb.webagg.utils import StreamIter
 
 
 # ============================================================================
-class RewriteDASHMixin(object):
-    def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
-        if rewritten_headers.status_headers.get_header('Content-Type') == 'application/dash+xml':
-            stream = self._decoding_stream(rewritten_headers, stream)
-            stream, _ = self.rewrite_dash(stream)
-            rewritten_headers.status_headers.remove_header('content-length')
-            return (rewritten_headers.status_headers, StreamIter(stream), True)
-
-        elif rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-mpegURL':
-            stream = self._decoding_stream(rewritten_headers, stream)
-            stream = self.rewrite_m3u8(stream)
-            rewritten_headers.status_headers.remove_header('content-length')
-            return (rewritten_headers.status_headers, StreamIter(stream), True)
-
-        return (super(RewriteDASHMixin, self).
-                handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
-
-    @classmethod
-    def rewrite_dash(cls, stream):
-        ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
-        namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'}
-
+class RewriteDASH(object):
+    def __call__(self, rwinfo):
         buff_io = BytesIO()
-        with closing(stream) as fh:
+        with closing(rwinfo.content_stream) as fh:
             while True:
                 buff = fh.read()
                 if not buff:
@@ -42,8 +20,15 @@ class RewriteDASHMixin(object):
             buff_io.write(buff)
 
         buff_io.seek(0)
+        res_buff, best_ids = self.rewrite_dash(buff_io)
+        return StreamIter(res_buff)
+
+    def rewrite_dash(self, stream):
+        ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
+        namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'}
+
         tree = ET.ElementTree()
-        tree.parse(buff_io)
+        tree.parse(stream)
 
         root = tree.getroot()
 
@@ -72,40 +57,8 @@ class RewriteDASHMixin(object):
         buff_io.seek(0)
         return buff_io, best_ids
 
-    @classmethod
-    def rewrite_m3u8(cls, stream):
-        buff = stream.read()
-
-        lines = buff.decode('utf-8').split('\n')
-        best = None
-        indexes = []
-        count = 0
-        best_index = None
-
-        for line in lines:
-            m = EXT_INF.match(line)
-            if m:
-                indexes.append(count)
-                bandwidth = int(m.group(1))
-                if not best or bandwidth > best:
-                    best = bandwidth
-                    best_index = count
-
-            count = count + 1
-
-        if indexes and best_index is not None:
-            indexes.remove(best_index)
-
-        for index in reversed(indexes):
-            del lines[index + 1]
-            del lines[index]
-
-        buff_io = BytesIO()
-        buff_io.write('\n'.join(lines).encode('utf-8'))
-        buff_io.seek(0)
-        return buff_io
-
 
+# ============================================================================
 def rewrite_fb_dash(string):
     DASH_SPLIT = r'\n",dash_prefetched_representation_ids:'
     inx = string.find(DASH_SPLIT)
@@ -117,7 +70,7 @@ class RewriteDASHMixin(object):
     buff = string.encode('utf-8').decode('unicode-escape')
     buff = buff.encode('utf-8')
     io = BytesIO(buff)
-    io, best_ids = RewriteDASHMixin.rewrite_dash(io)
+    io, best_ids = RewriteDASH().rewrite_dash(io)
     string = json.dumps(io.read().decode('utf-8'))
 
     string = string[1:-1].replace('<', r'\x3C')
diff --git a/pywb/rewrite/rewrite_hls.py b/pywb/rewrite/rewrite_hls.py
new file mode 100644
index 00000000..ae56129a
--- /dev/null
+++ b/pywb/rewrite/rewrite_hls.py
@@ -0,0 +1,44 @@
+import re
+from io import BytesIO
+from pywb.webagg.utils import StreamIter
+
+
+# ============================================================================
+class RewriteHLS(object):
+    EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
+
+    def __call__(self, rwinfo):
+        return StreamIter(self.rewrite_m3u8(rwinfo.content_stream))
+
+    def rewrite_m3u8(self, stream):
+        buff = stream.read()
+
+        lines = buff.decode('utf-8').split('\n')
+        best = None
+        indexes = []
+        count = 0
+        best_index = None
+
+        for line in lines:
+            m = self.EXT_INF.match(line)
+            if m:
+                indexes.append(count)
+                bandwidth = int(m.group(1))
+                if not best or bandwidth > best:
+                    best = bandwidth
+                    best_index = count
+
+            count = count + 1
+
+        if indexes and best_index is not None:
+            indexes.remove(best_index)
+
+        for index in reversed(indexes):
+            del lines[index + 1]
+            del lines[index]
+
+        buff_io = BytesIO()
+        buff_io.write('\n'.join(lines).encode('utf-8'))
+        buff_io.seek(0)
+        return buff_io
+
diff --git a/pywb/urlrewrite/rewriter.py b/pywb/urlrewrite/rewriter.py
index e619d95a..9121a9e6 100644
--- a/pywb/urlrewrite/rewriter.py
+++ b/pywb/urlrewrite/rewriter.py
@@ -1,10 +1,4 @@
-from warcio.utils import to_native_str
-from 
warcio.bufferedreaders import BufferedReader - -import webencodings -import re - -from pywb.utils.loaders import load_yaml_config +from pywb.rewrite.content_rewriter import BaseContentRewriter from pywb.rewrite.html_rewriter import HTMLRewriter @@ -12,19 +6,17 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter -from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter - from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter from pywb.rewrite.jsonp_rewriter import JSONPRewriter -from pywb.webagg.utils import StreamIter, BUFF_SIZE +from pywb.rewrite.rewrite_dash import RewriteDASH +from pywb.rewrite.rewrite_hls import RewriteHLS +from pywb.rewrite.rewrite_amf import RewriteAMF # ============================================================================ -class Rewriter(object): - CHARSET_REGEX = re.compile(b']*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)') - +class DefaultRewriter(BaseContentRewriter): all_rewriters = { 'header': PrefixHeaderRewriter, 'header-proxy': ProxyHeaderRewriter, @@ -39,6 +31,12 @@ class Rewriter(object): 'json': JSONPRewriter, 'xml': XMLRewriter, + + 'dash': RewriteDASH, + + 'hls': RewriteHLS, + + 'amf': RewriteAMF, } rewrite_types = { @@ -64,6 +62,9 @@ class Rewriter(object): # DASH 'application/dash+xml': 'dash', + # AMF + 'application/x-amf': 'amf', + # XML 'text/xml': 'xml', 'application/xml': 'xml', @@ -73,291 +74,8 @@ class Rewriter(object): 'text/plain': 'plain', } - def __init__(self, rules_file, replay_mod=''): - self.rules = [] - self.load_rules(rules_file) - self.replay_mod = replay_mod - #for rw in self.known_rewriters: - # self.all_rewriters[rw.name] = rw - - def add_rewriter(self, rw): - self.all_rewriters[rw.name] = rw - - def get_rewriter(self, url, text_type): - return self.all_rewriters.get(text_type) - - def load_rules(self, filename): - config = load_yaml_config(filename) - for rule in config.get('rules'): - rule = self.parse_rewrite_rule(rule) - if rule: - self.rules.append(rule) - - def parse_rewrite_rule(self, config): - rw_config = config.get('rewrite') - if not rw_config: - return - - rule = rw_config - url_prefix = config.get('url_prefix') - if not isinstance(url_prefix, list): - url_prefix = [url_prefix] - - rule['url_prefix'] = url_prefix - - regexs = rule.get('js_regexs') - if regexs: - parse_rules_func = RegexRewriter.parse_rules_from_config(regexs) - rule['js_regex_func'] = parse_rules_func - - return rule - - def get_rule(self, cdx): - urlkey = to_native_str(cdx['urlkey']) - - for rule in self.rules: - if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])): - return rule - - return {} - - def get_rw_class(self, rule, text_type, rwinfo): - if text_type == 'js' and not rwinfo.is_url_rw(): - text_type = 'js-proxy' - - rw_type = rule.get(text_type, text_type) - rw_class = self.all_rewriters.get(rw_type) - - return rw_type, rw_class - - def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None): - rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo) - - if rw_type in ('js', 'js_proxy'): - extra_rules = [] - if 'js_regex_func' in rule: - extra_rules = rule['js_regex_func'](rwinfo.url_rewriter) - - return rw_class(rwinfo.url_rewriter, extra_rules) - - elif rw_type != 'html': - return rw_class(rwinfo.url_rewriter) - - # HTML Rewriter - head_insert_str = 
self.get_head_insert(rwinfo, rule, head_insert_func, cdx) - - js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx) - css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx) - - rw = rw_class(rwinfo.url_rewriter, - js_rewriter=js_rewriter, - css_rewriter=css_rewriter, - head_insert=head_insert_str, - url=cdx['url'], - defmod=self.replay_mod, - parse_comments=rule.get('parse_comments', False)) - - return rw - - def get_head_insert(self, rwinfo, rule, head_insert_func, cdx): - head_insert_str = '' - charset = rwinfo.charset - - # if no charset set, attempt to extract from first 1024 - if not charset: - first_buff = rwinfo.read_and_keep(1024) - charset = self.extract_html_charset(first_buff) - - if head_insert_func: - head_insert_orig = head_insert_func(rule, cdx) - - if charset: - try: - head_insert_str = webencodings.encode(head_insert_orig, charset) - except: - pass - - if not head_insert_str: - charset = 'utf-8' - head_insert_str = head_insert_orig.encode(charset) - - head_insert_str = head_insert_str.decode('iso-8859-1') - - return head_insert_str - - def extract_html_charset(self, buff): - charset = None - m = self.CHARSET_REGEX.search(buff) - if m: - charset = m.group(1) - charset = to_native_str(charset) - - return charset - - def rewrite_headers(self, rwinfo): - if rwinfo.is_url_rw(): - header_rw_name = 'header' - else: - header_rw_name = 'header-proxy' - - header_rw_class = self.all_rewriters.get(header_rw_name) - rwinfo.rw_http_headers = header_rw_class(rwinfo)() - - def __call__(self, record, url_rewriter, cookie_rewriter, - head_insert_func=None, - cdx=None): - - rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter) - - self.rewrite_headers(rwinfo) - - content_rewriter = None - if rwinfo.is_content_rw(): - rule = self.get_rule(cdx) - content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) - - if not content_rewriter: - return rwinfo.rw_http_headers, StreamIter(rwinfo.content_stream), False - - #rwinfo.rw_http_headers.status_headers.remove_header('content-length') - - # align to line end for all non-html rewriting - align = (rwinfo.text_type != 'html') - - # Create rewriting generator - gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream, - rewrite_func=content_rewriter.rewrite, - final_read_func=content_rewriter.close, - align_to_line=align) - - return rwinfo.rw_http_headers, gen, True - - @staticmethod - def rewrite_text_stream_to_gen(stream, - rewrite_func, - final_read_func, - align_to_line): - """ - Convert stream to generator using applying rewriting func - to each portion of the stream. - Align to line boundaries if needed. 
- """ - try: - buff = '' - - while True: - buff = stream.read(BUFF_SIZE) - if not buff: - break - - if align_to_line: - buff += stream.readline() - - buff = rewrite_func(buff.decode('iso-8859-1')) - yield buff.encode('iso-8859-1') - - # For adding a tail/handling final buffer - buff = final_read_func() - if buff: - yield buff.encode('iso-8859-1') - - finally: - stream.close() - - -# ============================================================================ -class RewriteInfo(object): - TAG_REGEX = re.compile(b'^\s*\<') - - def __init__(self, record, rewriter, url_rewriter, cookie_rewriter): - self.record = record - - self.rw_http_headers = record.http_headers - self.content_stream = record.content_stream() - - self.rewriter = rewriter - - self.text_type = None - self.charset = None - - self.url_rewriter = url_rewriter - - if not cookie_rewriter: - cookie_rewriter = ExactPathCookieRewriter(url_rewriter) - - self.cookie_rewriter = cookie_rewriter - - self._fill_text_type_and_charset() - self._resolve_text_type() - - def _fill_text_type_and_charset(self): - content_type = self.record.http_headers.get_header('Content-Type') - if not content_type: - return - - parts = content_type.split(';', 1) - mime = parts[0] - - self.text_type = self.rewriter.rewrite_types.get(mime) - if not self.text_type: - return - - if len(parts) == 2: - parts = parts[1].lower().split('charset=', 1) - if len(parts) == 2: - self.charset = parts[1].strip() - - def _resolve_text_type(self): - mod = self.url_rewriter.wburl.mod - - if self.text_type == 'css' and mod == 'js_': - self.text_type = 'css' - - # only attempt to resolve between html and other text types - if self.text_type != 'html': - return - - if mod != 'js_' and mod != 'cs_': - return - - buff = self.read_and_keep(128) - - # check if starts with a tag, then likely html - if self.TAG_REGEX.match(buff): - self.text_type = 'html' - - def read_and_keep(self, size): - buff = self.content_stream.read(size) - self.content_stream = BufferedReader(self.content_stream, starting_data=buff) - return buff - - def is_content_rw(self): - if not self.url_rewriter.prefix: - return False - - if self.url_rewriter.wburl.mod == 'id_': - return False - - if self.text_type == 'html': - if self.url_rewriter.rewrite_opts.get('is_ajax'): - return False - - elif self.text_type == 'plain': - if self.url_rewriter.wburl.mod not in ('js_', 'cs_'): - return False - - elif not self.text_type: - return False - - return True - - def is_url_rw(self): - if not self.url_rewriter: - return False - - if self.url_rewriter.wburl.mod == 'id_': - return False - - return True - + def init_js_regex(self, regexs): + return RegexRewriter.parse_rules_from_config(regexs) + def get_rewrite_types(self): + return self.rewrite_types diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/urlrewrite/rewriterapp.py index c041bbcc..a8687c5e 100644 --- a/pywb/urlrewrite/rewriterapp.py +++ b/pywb/urlrewrite/rewriterapp.py @@ -3,7 +3,7 @@ import requests #from pywb.rewrite.rewrite_amf import RewriteAMFMixin #from pywb.rewrite.rewrite_dash import RewriteDASHMixin #from pywb.rewrite.rewrite_content import RewriteContent -from pywb.urlrewrite.rewriter import Rewriter +from pywb.urlrewrite.rewriter import DefaultRewriter from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter @@ -71,7 +71,7 @@ class RewriterApp(object): #frame_type = 'inverse' if framed_replay else False #self.content_rewriter = Rewriter(is_framed_replay=frame_type) - self.content_rw = 
Rewriter('pkg://pywb/rules.yaml', self.replay_mod) + self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod) if not jinja_env: jinja_env = JinjaEnv(globals={'static_path': 'static'}) diff --git a/tests/test_integration.py b/tests/test_integration.py index 35988b80..67ec287f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -152,8 +152,7 @@ class TestWbIntegration(BaseConfigTest): assert len(lines) == 17 assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239') - - def test_replay_banner_only(self): + def _test_replay_banner_only(self): resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved') # wb.js header insertion