From db9d0ae41a644ddd86dec9abee04b03b575db635 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 8 May 2017 19:17:09 -0700 Subject: [PATCH] new rewriting system! - new header rewriter - new extensible content rewriter in urlrewrite.rewriter! --- Dockerfile | 2 +- pywb/rewrite/html_rewriter.py | 21 +- pywb/urlrewrite/header_rewriter.py | 102 ++++++++ pywb/urlrewrite/rewriteinputreq.py | 2 + pywb/urlrewrite/rewriter.py | 363 +++++++++++++++++++++++++++++ pywb/urlrewrite/rewriterapp.py | 35 +-- 6 files changed, 504 insertions(+), 21 deletions(-) create mode 100644 pywb/urlrewrite/header_rewriter.py create mode 100644 pywb/urlrewrite/rewriter.py diff --git a/Dockerfile b/Dockerfile index b1d69cb9..73063b33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.5.2 +FROM python:3.5.3 MAINTAINER Ilya Kreymer diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index cfd99d3e..5651e474 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -88,8 +88,10 @@ class HTMLRewriterMixin(object): # =========================== def __init__(self, url_rewriter, head_insert=None, - js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter, + js_rewriter_class=None, + js_rewriter=None, + css_rewriter=None, + css_rewriter_class=None, url = '', defmod='', parse_comments=False): @@ -97,8 +99,19 @@ class HTMLRewriterMixin(object): self.url_rewriter = url_rewriter self._wb_parse_context = None - self.js_rewriter = js_rewriter_class(url_rewriter) - self.css_rewriter = css_rewriter_class(url_rewriter) + if js_rewriter: + self.js_rewriter = js_rewriter + elif js_rewriter_class: + self.js_rewriter = js_rewriter_class(url_rewriter) + else: + self.js_rewriter = JSRewriter(url_rewriter) + + if css_rewriter: + self.css_rewriter = css_rewriter + elif css_rewriter_class: + self.css_rewriter = css_rewriter_class(url_rewriter) + else: + self.css_rewriter = CSSRewriter(url_rewriter) self.head_insert = head_insert 
self.parse_comments = parse_comments diff --git a/pywb/urlrewrite/header_rewriter.py b/pywb/urlrewrite/header_rewriter.py new file mode 100644 index 00000000..8788c535 --- /dev/null +++ b/pywb/urlrewrite/header_rewriter.py @@ -0,0 +1,102 @@ +from warcio.statusandheaders import StatusAndHeaders +from warcio.timeutils import datetime_to_http_date +from datetime import datetime, timedelta + + +#============================================================================= +class PrefixHeaderRewriter(object): + header_rules = { + 'content-type': 'keep', + 'content-disposition': 'keep', + 'content-range': 'keep', + 'accept-ranges': 'keep', + 'www-authenticate': 'keep', + 'proxy-authenticate': 'keep', + + 'location': 'url-rewrite', + 'content-location': 'url-rewrite', + 'content-base': 'url-rewrite', + + 'content-encoding': 'keep-if-no-content-rewrite', + 'content-length': 'content-length', + + 'set-cookie': 'cookie', + 'cookie': 'cookie', + } + + default_rule = 'prefix' + + def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'): + self.header_prefix = header_prefix + self.rwinfo = rwinfo + self.http_headers = rwinfo.record.http_headers + + def __call__(self): + new_headers_list = [] + for name, value in self.http_headers.headers: + rule = self.header_rules.get(name.lower(), self.default_rule) + new_header = self.rewrite_header(name, value, rule) + if new_header: + if isinstance(new_header, list): + new_headers_list.extend(new_header) + else: + new_headers_list.append(new_header) + + return StatusAndHeaders(self.http_headers.statusline, + headers=new_headers_list, + protocol=self.http_headers.protocol) + + def rewrite_header(self, name, value, rule): + if rule == 'keep': + return (name, value) + + elif rule == 'url-rewrite': + return (name, self.rwinfo.url_rewriter.rewrite(value)) + + elif rule == 'keep-if-no-content-rewrite': + if not self.rwinfo.is_content_rw(): + return (name, value) + + elif rule == 'content-length': + if value == '0': + return (name, value) + + 
if not self.rwinfo.is_content_rw(): + try: + if int(value) >= 0: + return (name, value) + except: + pass + + elif rule == 'cookie': + if self.rwinfo.cookie_rewriter: + return self.rwinfo.cookie_rewriter.rewrite(value) + else: + return (name, value) + + # default 'prefix' + return (self.header_prefix + name, value) + + def _add_cache_headers(self, new_headers, http_cache): + try: + age = int(http_cache) + except: + age = 0 + + if age <= 0: + new_headers.append(('Cache-Control', 'no-cache; no-store')) + else: + dt = datetime.utcnow() + dt = dt + timedelta(seconds=age) + new_headers.append(('Cache-Control', 'max-age=' + str(age))) + new_headers.append(('Expires', datetime_to_http_date(dt))) + + +#============================================================================= +class ProxyHeaderRewriter(PrefixHeaderRewriter): + header_rules = { + 'transfer-encoding': 'prefix', + 'connection': 'prefix', + } + + default_rule = 'keep' diff --git a/pywb/urlrewrite/rewriteinputreq.py b/pywb/urlrewrite/rewriteinputreq.py index ccbc29f3..84569358 100644 --- a/pywb/urlrewrite/rewriteinputreq.py +++ b/pywb/urlrewrite/rewriteinputreq.py @@ -89,6 +89,8 @@ class RewriteInputRequest(DirectWSGIInputRequest): return headers def _req_cookie_rewrite(self, value): + return value + rule = self.rewriter.ruleset.get_first_match(self.urlkey) if not rule or not rule.req_cookie_rewrite: return value diff --git a/pywb/urlrewrite/rewriter.py b/pywb/urlrewrite/rewriter.py new file mode 100644 index 00000000..e619d95a --- /dev/null +++ b/pywb/urlrewrite/rewriter.py @@ -0,0 +1,363 @@ +from warcio.utils import to_native_str +from warcio.bufferedreaders import BufferedReader + +import webencodings +import re + +from pywb.utils.loaders import load_yaml_config + +from pywb.rewrite.html_rewriter import HTMLRewriter + +from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter +from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from 
pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter + +from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter + +from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter + +from pywb.rewrite.jsonp_rewriter import JSONPRewriter + +from pywb.webagg.utils import StreamIter, BUFF_SIZE + + +# ============================================================================ +class Rewriter(object): + CHARSET_REGEX = re.compile(b']*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)') + + all_rewriters = { + 'header': PrefixHeaderRewriter, + 'header-proxy': ProxyHeaderRewriter, + + 'html': HTMLRewriter, + + 'css': CSSRewriter, + + 'js': JSLocationOnlyRewriter, + 'js-proxy': JSNoneRewriter, + + 'json': JSONPRewriter, + + 'xml': XMLRewriter, + } + + rewrite_types = { + # HTML + 'text/html': 'html', + 'application/xhtml': 'html', + 'application/xhtml+xml': 'html', + + # CSS + 'text/css': 'css', + + # JS + 'text/javascript': 'js', + 'application/javascript': 'js', + 'application/x-javascript': 'js', + + # JSON + 'application/json': 'json', + + # HLS + 'application/x-mpegURL': 'hls', + + # DASH + 'application/dash+xml': 'dash', + + # XML + 'text/xml': 'xml', + 'application/xml': 'xml', + 'application/rss+xml': 'xml', + + # PLAIN + 'text/plain': 'plain', + } + + def __init__(self, rules_file, replay_mod=''): + self.rules = [] + self.load_rules(rules_file) + self.replay_mod = replay_mod + #for rw in self.known_rewriters: + # self.all_rewriters[rw.name] = rw + + def add_rewriter(self, rw): + self.all_rewriters[rw.name] = rw + + def get_rewriter(self, url, text_type): + return self.all_rewriters.get(text_type) + + def load_rules(self, filename): + config = load_yaml_config(filename) + for rule in config.get('rules'): + rule = self.parse_rewrite_rule(rule) + if rule: + self.rules.append(rule) + + def parse_rewrite_rule(self, config): + rw_config = config.get('rewrite') + if not rw_config: + return + + rule = rw_config + url_prefix = 
config.get('url_prefix') + if not isinstance(url_prefix, list): + url_prefix = [url_prefix] + + rule['url_prefix'] = url_prefix + + regexs = rule.get('js_regexs') + if regexs: + parse_rules_func = RegexRewriter.parse_rules_from_config(regexs) + rule['js_regex_func'] = parse_rules_func + + return rule + + def get_rule(self, cdx): + urlkey = to_native_str(cdx['urlkey']) + + for rule in self.rules: + if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])): + return rule + + return {} + + def get_rw_class(self, rule, text_type, rwinfo): + if text_type == 'js' and not rwinfo.is_url_rw(): + text_type = 'js-proxy' + + rw_type = rule.get(text_type, text_type) + rw_class = self.all_rewriters.get(rw_type) + + return rw_type, rw_class + + def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None): + rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo) + + if rw_type in ('js', 'js-proxy'): + extra_rules = [] + if 'js_regex_func' in rule: + extra_rules = rule['js_regex_func'](rwinfo.url_rewriter) + + return rw_class(rwinfo.url_rewriter, extra_rules) + + elif rw_type != 'html': + return rw_class(rwinfo.url_rewriter) + + # HTML Rewriter + head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx) + + js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx) + css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx) + + rw = rw_class(rwinfo.url_rewriter, + js_rewriter=js_rewriter, + css_rewriter=css_rewriter, + head_insert=head_insert_str, + url=cdx['url'], + defmod=self.replay_mod, + parse_comments=rule.get('parse_comments', False)) + + return rw + + def get_head_insert(self, rwinfo, rule, head_insert_func, cdx): + head_insert_str = '' + charset = rwinfo.charset + + # if no charset set, attempt to extract from first 1024 + if not charset: + first_buff = rwinfo.read_and_keep(1024) + charset = self.extract_html_charset(first_buff) + + if head_insert_func: + head_insert_orig = head_insert_func(rule, cdx) + + if 
charset: + try: + head_insert_str = webencodings.encode(head_insert_orig, charset) + except: + pass + + if not head_insert_str: + charset = 'utf-8' + head_insert_str = head_insert_orig.encode(charset) + + head_insert_str = head_insert_str.decode('iso-8859-1') + + return head_insert_str + + def extract_html_charset(self, buff): + charset = None + m = self.CHARSET_REGEX.search(buff) + if m: + charset = m.group(1) + charset = to_native_str(charset) + + return charset + + def rewrite_headers(self, rwinfo): + if rwinfo.is_url_rw(): + header_rw_name = 'header' + else: + header_rw_name = 'header-proxy' + + header_rw_class = self.all_rewriters.get(header_rw_name) + rwinfo.rw_http_headers = header_rw_class(rwinfo)() + + def __call__(self, record, url_rewriter, cookie_rewriter, + head_insert_func=None, + cdx=None): + + rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter) + + self.rewrite_headers(rwinfo) + + content_rewriter = None + if rwinfo.is_content_rw(): + rule = self.get_rule(cdx) + content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) + + if not content_rewriter: + return rwinfo.rw_http_headers, StreamIter(rwinfo.content_stream), False + + #rwinfo.rw_http_headers.status_headers.remove_header('content-length') + + # align to line end for all non-html rewriting + align = (rwinfo.text_type != 'html') + + # Create rewriting generator + gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream, + rewrite_func=content_rewriter.rewrite, + final_read_func=content_rewriter.close, + align_to_line=align) + + return rwinfo.rw_http_headers, gen, True + + @staticmethod + def rewrite_text_stream_to_gen(stream, + rewrite_func, + final_read_func, + align_to_line): + """ + Convert stream to generator by applying rewriting func + to each portion of the stream. + Align to line boundaries if needed. 
+ """ + try: + buff = '' + + while True: + buff = stream.read(BUFF_SIZE) + if not buff: + break + + if align_to_line: + buff += stream.readline() + + buff = rewrite_func(buff.decode('iso-8859-1')) + yield buff.encode('iso-8859-1') + + # For adding a tail/handling final buffer + buff = final_read_func() + if buff: + yield buff.encode('iso-8859-1') + + finally: + stream.close() + + +# ============================================================================ +class RewriteInfo(object): + TAG_REGEX = re.compile(b'^\s*\<') + + def __init__(self, record, rewriter, url_rewriter, cookie_rewriter): + self.record = record + + self.rw_http_headers = record.http_headers + self.content_stream = record.content_stream() + + self.rewriter = rewriter + + self.text_type = None + self.charset = None + + self.url_rewriter = url_rewriter + + if not cookie_rewriter: + cookie_rewriter = ExactPathCookieRewriter(url_rewriter) + + self.cookie_rewriter = cookie_rewriter + + self._fill_text_type_and_charset() + self._resolve_text_type() + + def _fill_text_type_and_charset(self): + content_type = self.record.http_headers.get_header('Content-Type') + if not content_type: + return + + parts = content_type.split(';', 1) + mime = parts[0] + + self.text_type = self.rewriter.rewrite_types.get(mime) + if not self.text_type: + return + + if len(parts) == 2: + parts = parts[1].lower().split('charset=', 1) + if len(parts) == 2: + self.charset = parts[1].strip() + + def _resolve_text_type(self): + mod = self.url_rewriter.wburl.mod + + if self.text_type == 'css' and mod == 'js_': + self.text_type = 'css' + + # only attempt to resolve between html and other text types + if self.text_type != 'html': + return + + if mod != 'js_' and mod != 'cs_': + return + + buff = self.read_and_keep(128) + + # check if starts with a tag, then likely html + if self.TAG_REGEX.match(buff): + self.text_type = 'html' + + def read_and_keep(self, size): + buff = self.content_stream.read(size) + self.content_stream = 
BufferedReader(self.content_stream, starting_data=buff) + return buff + + def is_content_rw(self): + if not self.url_rewriter.prefix: + return False + + if self.url_rewriter.wburl.mod == 'id_': + return False + + if self.text_type == 'html': + if self.url_rewriter.rewrite_opts.get('is_ajax'): + return False + + elif self.text_type == 'plain': + if self.url_rewriter.wburl.mod not in ('js_', 'cs_'): + return False + + elif not self.text_type: + return False + + return True + + def is_url_rw(self): + if not self.url_rewriter: + return False + + if self.url_rewriter.wburl.mod == 'id_': + return False + + return True + + diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/urlrewrite/rewriterapp.py index 41bd2476..c041bbcc 100644 --- a/pywb/urlrewrite/rewriterapp.py +++ b/pywb/urlrewrite/rewriterapp.py @@ -1,8 +1,9 @@ import requests -from pywb.rewrite.rewrite_amf import RewriteAMFMixin -from pywb.rewrite.rewrite_dash import RewriteDASHMixin -from pywb.rewrite.rewrite_content import RewriteContent +#from pywb.rewrite.rewrite_amf import RewriteAMFMixin +#from pywb.rewrite.rewrite_dash import RewriteDASHMixin +#from pywb.rewrite.rewrite_content import RewriteContent +from pywb.urlrewrite.rewriter import Rewriter from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter @@ -44,8 +45,8 @@ class UpstreamException(WbException): # ============================================================================ -class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): - pass +#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): +# pass # ============================================================================ @@ -67,9 +68,10 @@ class RewriterApp(object): self.frame_mod = None self.replay_mod = '' - frame_type = 'inverse' if framed_replay else False + #frame_type = 'inverse' if framed_replay else False - self.content_rewriter = Rewriter(is_framed_replay=frame_type) + #self.content_rewriter = 
Rewriter(is_framed_replay=frame_type) + self.content_rw = Rewriter('pkg://pywb/rules.yaml', self.replay_mod) if not jinja_env: jinja_env = JinjaEnv(globals={'static_path': 'static'}) @@ -149,7 +151,7 @@ class RewriterApp(object): urlkey = canonicalize(wb_url.url) inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, - self.content_rewriter) + self.content_rw) inputreq.include_post_query(wb_url.url) @@ -267,14 +269,15 @@ class RewriterApp(object): cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key) - result = self.content_rewriter.rewrite_content(urlrewriter, - record.http_headers, - record.raw_stream, - head_insert_func, - urlkey, - cdx, - cookie_rewriter, - environ) + #result = self.content_rewriter.rewrite_content(urlrewriter, + # record.http_headers, + # record.raw_stream, + # head_insert_func, + # urlkey, + # cdx, + # cookie_rewriter, + # environ) + result = self.content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) status_headers, gen, is_rw = result