diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py
new file mode 100644
index 00000000..e94b6f21
--- /dev/null
+++ b/pywb/rewrite/content_rewriter.py
@@ -0,0 +1,324 @@
+from io import BytesIO
+
+from contextlib import closing
+
+from warcio.bufferedreaders import BufferedReader
+from warcio.utils import to_native_str
+
+import re
+import webencodings
+
+from pywb.webagg.utils import StreamIter, BUFF_SIZE
+from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
+
+from pywb.utils.loaders import load_yaml_config
+
+
+# ============================================================================
+class BaseContentRewriter(object):
+    """
+    Base class for content rewriting driven by per-url-prefix rules
+    loaded from a YAML config.
+
+    This class reads ``self.all_rewriters`` (rewriter name -> rewriter
+    class) but does not define it; subclasses must provide that mapping
+    and implement ``init_js_regex()`` and ``get_rewrite_types()``.
+    """
+    CHARSET_REGEX = re.compile(b']*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
+
+    def __init__(self, rules_file, replay_mod=''):
+        """
+        Load the rewrite rules from ``rules_file`` and remember
+        ``replay_mod``, which is passed to the html rewriter as ``defmod``.
+        """
+        self.rules = []
+        self.load_rules(rules_file)
+        self.replay_mod = replay_mod
+
+    def add_rewriter(self, rw):
+        """
+        Register rewriter class ``rw`` under its ``name`` attribute.
+        """
+        # NOTE(review): if all_rewriters is a class-level dict on the
+        # subclass, this mutates state shared by all instances -- confirm
+        self.all_rewriters[rw.name] = rw
+
+    def get_rewriter(self, url, text_type):
+        """
+        Return the rewriter class registered for ``text_type`` or None.
+        ``url`` is currently not used for the lookup.
+        """
+        return self.all_rewriters.get(text_type)
+
+    def load_rules(self, filename):
+        """
+        Load the YAML config ``filename`` and append every entry of its
+        'rules' list that contains a 'rewrite' section to ``self.rules``.
+        """
+        config = load_yaml_config(filename)
+
+        # a config without a 'rules' entry yields no rules instead of
+        # failing while iterating over None
+        for rule_config in config.get('rules') or []:
+            rule = self.parse_rewrite_rule(rule_config)
+            if rule:
+                self.rules.append(rule)
+
+    def parse_rewrite_rule(self, config):
+        """
+        Convert a single rule config into a rewrite rule dict.
+
+        Returns None if the config has no 'rewrite' section. Otherwise
+        the 'rewrite' dict itself is returned (modified in place) with
+        'url_prefix' normalized to a list and, if 'js_regexs' is set,
+        'js_regex_func' set to the result of ``init_js_regex()``.
+        """
+        rw_config = config.get('rewrite')
+        if not rw_config:
+            return
+
+        rule = rw_config
+        url_prefix = config.get('url_prefix')
+        if not isinstance(url_prefix, list):
+            url_prefix = [url_prefix]
+
+        rule['url_prefix'] = url_prefix
+
+        regexs = rule.get('js_regexs')
+        if regexs:
+            rule['js_regex_func'] = self.init_js_regex(regexs)
+
+        return rule
+
+    def get_rule(self, cdx):
+        """
+        Return the first loaded rule with a url prefix that
+        ``cdx['urlkey']`` starts with, or an empty dict if none matches.
+        """
+        urlkey = to_native_str(cdx['urlkey'])
+
+        for rule in self.rules:
+            if urlkey.startswith(tuple(rule['url_prefix'])):
+                return rule
+
+        return {}
+
+    def get_rw_class(self, rule, text_type, rwinfo):
+        """
+        Resolve the rewriter type and class for ``text_type``.
+
+        'js' becomes 'js-proxy' when url rewriting is off for this
+        request, and a rule may map a text type to another rewriter type.
+        Returns a ``(rw_type, rw_class)`` tuple; ``rw_class`` is None if
+        no rewriter is registered for ``rw_type``.
+        """
+        if text_type == 'js' and not rwinfo.is_url_rw():
+            text_type = 'js-proxy'
+
+        rw_type = rule.get(text_type, text_type)
+        rw_class = self.all_rewriters.get(rw_type)
+
+        return rw_type, rw_class
+
+    def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
+        """
+        Instantiate the rewriter for ``text_type`` under ``rule``.
+
+        JS rewriters also receive the rule's extra regex rules, the html
+        rewriter receives nested js and css rewriters plus the head
+        insert, and every other rewriter class is constructed with the
+        url rewriter only.
+        """
+        rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)
+
+        # NOTE(review): get_rw_class() produces 'js-proxy' (hyphen) while
+        # this check lists 'js_proxy' (underscore), so a 'js-proxy' type
+        # takes the generic branch below -- confirm whether intended
+        if rw_type in ('js', 'js_proxy'):
+            extra_rules = []
+            if 'js_regex_func' in rule:
+                extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)
+
+            return rw_class(rwinfo.url_rewriter, extra_rules)
+
+        elif rw_type != 'html':
+            return rw_class(rwinfo.url_rewriter)
+
+        # HTML Rewriter
+        head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx)
+
+        js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
+        css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)
+
+        return rw_class(rwinfo.url_rewriter,
+                        js_rewriter=js_rewriter,
+                        css_rewriter=css_rewriter,
+                        head_insert=head_insert_str,
+                        url=cdx['url'],
+                        defmod=self.replay_mod,
+                        parse_comments=rule.get('parse_comments', False))
+
+    def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
+        """
+        Return the head insert as text whose code points are the bytes
+        of the insert encoded in the page charset (utf-8 as a fallback).
+
+        Returns '' when no ``head_insert_func`` is given.
+        """
+        head_insert_str = ''
+        charset = rwinfo.charset
+
+        # if no charset set, attempt to extract from first 1024
+        if not charset:
+            first_buff = rwinfo.read_and_keep(1024)
+            charset = self.extract_html_charset(first_buff)
+
+        if head_insert_func:
+            head_insert_orig = head_insert_func(rule, cdx)
+
+            if charset:
+                try:
+                    head_insert_str = webencodings.encode(head_insert_orig, charset)
+                except Exception:
+                    # best effort: an unknown charset label or unencodable
+                    # text falls through to the utf-8 fallback below
+                    pass
+
+            if not head_insert_str:
+                charset = 'utf-8'
+                head_insert_str = head_insert_orig.encode(charset)
+
+            # carry the encoded bytes through the text-based rewriters;
+            # the streaming output re-encodes with iso-8859-1
+            head_insert_str = head_insert_str.decode('iso-8859-1')
+
+        return head_insert_str
+
+    def extract_html_charset(self, buff):
+        """
+        Return the charset found by CHARSET_REGEX in the bytes ``buff``
+        as a native string, or None if there is no match.
+        """
+        charset = None
+        m = self.CHARSET_REGEX.search(buff)
+        if m:
+            charset = to_native_str(m.group(1))
+
+        return charset
+
+    def rewrite_headers(self, rwinfo):
+        """
+        Run the 'header' rewriter (or 'header-proxy' when url rewriting
+        is off) and store its result in ``rwinfo.rw_http_headers``.
+        """
+        if rwinfo.is_url_rw():
+            header_rw_name = 'header'
+        else:
+            header_rw_name = 'header-proxy'
+
+        header_rw_class = self.all_rewriters.get(header_rw_name)
+        rwinfo.rw_http_headers = header_rw_class(rwinfo)()
+
+    def __call__(self, record, url_rewriter, cookie_rewriter,
+                 head_insert_func=None,
+                 cdx=None):
+        """
+        Rewrite the headers and, where applicable, the content of ``record``.
+
+        Returns a tuple ``(rewritten headers, content generator,
+        is_rewritten)`` where ``is_rewritten`` is True only if a content
+        rewriter was applied to the stream.
+        """
+        rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
+
+        self.rewrite_headers(rwinfo)
+
+        content_rewriter = None
+        if rwinfo.is_content_rw():
+            rule = self.get_rule(cdx)
+            content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo,
+                                                    cdx, head_insert_func)
+
+        is_rewritten = content_rewriter is not None
+
+        if is_rewritten:
+            gen = content_rewriter(rwinfo)
+        else:
+            gen = StreamIter(rwinfo.content_stream)
+
+        return rwinfo.rw_http_headers, gen, is_rewritten
+
+    def init_js_regex(self, regexs):
+        """
+        Hook called by parse_rewrite_rule() with a rule's 'js_regexs'
+        config; the result is stored as the rule's 'js_regex_func'.
+        Must be implemented by subclasses.
+        """
+        raise NotImplementedError()
+
+    def init_js_regexs(self, regexs):
+        # previous spelling of the hook, kept so existing callers of
+        # this name still work; parse_rewrite_rule() calls init_js_regex()
+        return self.init_js_regex(regexs)
+
+    def get_rewrite_types(self):
+        """
+        Return the mapping of mime type -> text type that is passed to
+        RewriteInfo. Must be implemented by subclasses.
+        """
+        raise NotImplementedError()
+
+
+# ============================================================================
+class StreamingRewriter(object):
+    """
+    Base class for rewriters that process the content stream in chunks,
+    passing each chunk (decoded as iso-8859-1) through rewrite().
+    """
+    def __init__(self):
+        # when True, every chunk read is extended to the end of the
+        # current line before it is rewritten
+        self.align_to_line = True
+
+    def __call__(self, rwinfo):
+        """
+        Return a generator of rewritten byte chunks read from
+        ``rwinfo.content_stream``.
+        """
+        return self.rewrite_text_stream_to_gen(rwinfo.content_stream,
+                                               rewrite_func=self.rewrite,
+                                               final_read_func=self.close,
+                                               align_to_line=self.align_to_line)
+
+    def rewrite(self, string):
+        """
+        Rewrite one chunk of text. The default returns it unchanged.
+        """
+        return string
+
+    def close(self):
+        """
+        Return any final text to emit after the stream is exhausted.
+        The default returns an empty string.
+        """
+        return ''
+
+    def rewrite_text_stream_to_gen(self, stream,
+                                   rewrite_func,
+                                   final_read_func,
+                                   align_to_line):
+        """
+        Convert stream to a generator by applying the rewriting func
+        to each portion of the stream.
+        Align to line boundaries if needed.
+
+        Each portion of up to BUFF_SIZE bytes is decoded as iso-8859-1,
+        passed to ``rewrite_func`` and yielded re-encoded as iso-8859-1.
+        A non-empty result of ``final_read_func`` is yielded last.
+        The stream is closed when the generator finishes or is closed.
+        """
+        try:
+            while True:
+                buff = stream.read(BUFF_SIZE)
+                if not buff:
+                    break
+
+                if align_to_line:
+                    buff += stream.readline()
+
+                buff = rewrite_func(buff.decode('iso-8859-1'))
+                yield buff.encode('iso-8859-1')
+
+            # For adding a tail/handling final buffer
+            buff = final_read_func()
+            if buff:
+                yield buff.encode('iso-8859-1')
+
+        finally:
+            stream.close()
+
+
+# ============================================================================
+class RewriteInfo(object):
+    """
+    Per-record rewriting state: the record, its headers and content
+    stream, the url and cookie rewriters, and the text type and charset
+    derived from the record's Content-Type header.
+    """
+    TAG_REGEX = re.compile(b'^\s*\<')
+
+    def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
+        """
+        ``rewrite_types`` maps a mime type to a text type. If no
+        ``cookie_rewriter`` is given, an ExactPathCookieRewriter for
+        ``url_rewriter`` is used.
+        """
+        self.record = record
+
+        self.rw_http_headers = record.http_headers
+        self.content_stream = record.content_stream()
+
+        self.rewrite_types = rewrite_types
+
+        self.text_type = None
+        self.charset = None
+
+        self.url_rewriter = url_rewriter
+        self.cookie_rewriter = cookie_rewriter or ExactPathCookieRewriter(url_rewriter)
+
+        self._fill_text_type_and_charset()
+        self._resolve_text_type()
+
+    def _fill_text_type_and_charset(self):
+        """
+        Set ``text_type`` from the mime part of the Content-Type header
+        and, for a known text type, ``charset`` from its 'charset='
+        parameter (lowercased and stripped) when present.
+        """
+        content_type = self.record.http_headers.get_header('Content-Type')
+        if not content_type:
+            return
+
+        mime, has_params, params = content_type.partition(';')
+
+        self.text_type = self.rewrite_types.get(mime)
+        if not self.text_type or not has_params:
+            return
+
+        _, has_charset, value = params.lower().partition('charset=')
+        if has_charset:
+            self.charset = value.strip()
+
+    def _resolve_text_type(self):
+        """
+        Re-check the text type of 'html' content requested with the
+        'js_' or 'cs_' modifier by peeking at the first 128 bytes.
+        """
+        mod = self.url_rewriter.wburl.mod
+
+        # NOTE(review): this assigns the value text_type already has,
+        # so it has no effect as written -- confirm the intended type
+        if mod == 'js_' and self.text_type == 'css':
+            self.text_type = 'css'
+
+        # only attempt to resolve between html and other text types,
+        # and only for the js_ and cs_ modifiers
+        if self.text_type != 'html' or mod not in ('js_', 'cs_'):
+            return
+
+        head = self.read_and_keep(128)
+
+        # check if starts with a tag, then likely html
+        # NOTE(review): text_type is already 'html' at this point, so
+        # the outcome of the match does not change it -- confirm
+        if self.TAG_REGEX.match(head):
+            self.text_type = 'html'
+
+    def read_and_keep(self, size):
+        """
+        Read up to ``size`` bytes from the content stream and return
+        them; the stream is replaced by a BufferedReader that is given
+        the same bytes as its starting data.
+        """
+        head = self.content_stream.read(size)
+        self.content_stream = BufferedReader(self.content_stream, starting_data=head)
+        return head
+
+    def is_content_rw(self):
+        """
+        Return True if the content of this record should be rewritten.
+        """
+        url_rewriter = self.url_rewriter
+
+        if not url_rewriter.prefix or url_rewriter.wburl.mod == 'id_':
+            return False
+
+        if not self.text_type:
+            return False
+
+        # html is left as is when the 'is_ajax' rewrite option is set
+        if self.text_type == 'html':
+            return not url_rewriter.rewrite_opts.get('is_ajax')
+
+        # plain text is only rewritten for the js_ and cs_ modifiers
+        if self.text_type == 'plain':
+            return url_rewriter.wburl.mod in ('js_', 'cs_')
+
+        return True
+
+    def is_url_rw(self):
+        """
+        Return True if a url rewriter is set and the modifier is not 'id_'.
+        """
+        if self.url_rewriter and self.url_rewriter.wburl.mod != 'id_':
+            return True
+
+        return False
+
+
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 5651e474..54b0802b 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -11,6 +11,8 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
+from pywb.rewrite.content_rewriter import StreamingRewriter
+
import six.moves.html_parser
six.moves.html_parser.unescape = lambda x: x
from six import text_type
@@ -441,7 +443,7 @@ class HTMLRewriterMixin(object):
#=================================================================
-class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
+class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
PARSETAG = re.compile('[<]')
def __init__(self, *args, **kwargs):
@@ -451,6 +453,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
HTMLParser.__init__(self)
super(HTMLRewriter, self).__init__(*args, **kwargs)
+ # for StreamingRewriter
+ self.align_to_line = False
def reset(self):
HTMLParser.reset(self)
diff --git a/pywb/rewrite/jsonp_rewriter.py b/pywb/rewrite/jsonp_rewriter.py
index d0558edb..6f603566 100644
--- a/pywb/rewrite/jsonp_rewriter.py
+++ b/pywb/rewrite/jsonp_rewriter.py
@@ -1,10 +1,14 @@
import re
+from pywb.rewrite.content_rewriter import StreamingRewriter
-class JSONPRewriter(object):
+
+# ============================================================================
+class JSONPRewriter(StreamingRewriter):
JSONP = re.compile(r'^(\w+)\(\{')
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
def __init__(self, urlrewriter):
+ super(JSONPRewriter, self).__init__()
self.urlrewriter = urlrewriter
def rewrite(self, string):
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
index da9ee355..9e4f0958 100644
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@@ -1,6 +1,7 @@
import re
from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.content_rewriter import StreamingRewriter
#=================================================================
@@ -13,7 +14,7 @@ def load_function(string):
#=================================================================
-class RegexRewriter(object):
+class RegexRewriter(StreamingRewriter):
#@staticmethod
#def comment_out(string):
# return '/*' + string + '*/'
@@ -43,6 +44,7 @@ class RegexRewriter(object):
#DEFAULT_OP = add_prefix
def __init__(self, rewriter, rules):
+ super(RegexRewriter, self).__init__()
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py
index ddd4e078..075af986 100644
--- a/pywb/rewrite/rewrite_amf.py
+++ b/pywb/rewrite/rewrite_amf.py
@@ -3,16 +3,9 @@ from six.moves import zip
# ============================================================================
-# Expiermental: not fully tested
-class RewriteAMFMixin(object): #pragma: no cover
- def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
- if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
- stream = self.rewrite_amf(stream, env)
-
- return (super(RewriteAMFMixin, self).
- handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
-
- def rewrite_amf(self, stream, env):
+# Experimental: not fully tested
+class RewriteAMF(object): #pragma: no cover
+ def __call__(self, rwinfo):
try:
from pyamf import remoting
@@ -26,9 +19,10 @@ class RewriteAMFMixin(object): #pragma: no cover
iobuff.seek(0)
res = remoting.decode(iobuff)
- if env and env.get('pywb.inputdata'):
- inputdata = env.get('pywb.inputdata')
+ # TODO: revisit this
+ inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata')
+ if inputdata:
new_list = []
for src, target in zip(inputdata.bodies, res.bodies):
diff --git a/pywb/rewrite/rewrite_dash.py b/pywb/rewrite/rewrite_dash.py
index c01a5eab..497a381b 100644
--- a/pywb/rewrite/rewrite_dash.py
+++ b/pywb/rewrite/rewrite_dash.py
@@ -1,39 +1,17 @@
-import xml.etree.ElementTree as ET
from contextlib import closing
from io import BytesIO, StringIO
import json
-from pywb.webagg.utils import StreamIter
-import re
+import xml.etree.ElementTree as ET
-EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
+from pywb.webagg.utils import StreamIter
# ============================================================================
-class RewriteDASHMixin(object):
- def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
- if rewritten_headers.status_headers.get_header('Content-Type') == 'application/dash+xml':
- stream = self._decoding_stream(rewritten_headers, stream)
- stream, _ = self.rewrite_dash(stream)
- rewritten_headers.status_headers.remove_header('content-length')
- return (rewritten_headers.status_headers, StreamIter(stream), True)
-
- elif rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-mpegURL':
- stream = self._decoding_stream(rewritten_headers, stream)
- stream = self.rewrite_m3u8(stream)
- rewritten_headers.status_headers.remove_header('content-length')
- return (rewritten_headers.status_headers, StreamIter(stream), True)
-
- return (super(RewriteDASHMixin, self).
- handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
-
- @classmethod
- def rewrite_dash(cls, stream):
- ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
- namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'}
-
+class RewriteDASH(object):
+ def __call__(self, rwinfo):
buff_io = BytesIO()
- with closing(stream) as fh:
+ with closing(rwinfo.content_stream) as fh:
while True:
buff = fh.read()
if not buff:
@@ -42,8 +20,15 @@ class RewriteDASHMixin(object):
buff_io.write(buff)
buff_io.seek(0)
+ res_buff, best_ids = self.rewrite_dash(buff_io)
+ return StreamIter(res_buff)
+
+ def rewrite_dash(self, stream):
+ ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
+ namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'}
+
tree = ET.ElementTree()
- tree.parse(buff_io)
+ tree.parse(stream)
root = tree.getroot()
@@ -72,40 +57,8 @@ class RewriteDASHMixin(object):
buff_io.seek(0)
return buff_io, best_ids
- @classmethod
- def rewrite_m3u8(cls, stream):
- buff = stream.read()
-
- lines = buff.decode('utf-8').split('\n')
- best = None
- indexes = []
- count = 0
- best_index = None
-
- for line in lines:
- m = EXT_INF.match(line)
- if m:
- indexes.append(count)
- bandwidth = int(m.group(1))
- if not best or bandwidth > best:
- best = bandwidth
- best_index = count
-
- count = count + 1
-
- if indexes and best_index is not None:
- indexes.remove(best_index)
-
- for index in reversed(indexes):
- del lines[index + 1]
- del lines[index]
-
- buff_io = BytesIO()
- buff_io.write('\n'.join(lines).encode('utf-8'))
- buff_io.seek(0)
- return buff_io
-
+# ============================================================================
def rewrite_fb_dash(string):
DASH_SPLIT = r'\n",dash_prefetched_representation_ids:'
inx = string.find(DASH_SPLIT)
@@ -117,7 +70,7 @@ def rewrite_fb_dash(string):
buff = string.encode('utf-8').decode('unicode-escape')
buff = buff.encode('utf-8')
io = BytesIO(buff)
- io, best_ids = RewriteDASHMixin.rewrite_dash(io)
+ io, best_ids = RewriteDASH().rewrite_dash(io)  # rewrite_dash() is an instance method of RewriteDASH
string = json.dumps(io.read().decode('utf-8'))
string = string[1:-1].replace('<', r'\x3C')
diff --git a/pywb/rewrite/rewrite_hls.py b/pywb/rewrite/rewrite_hls.py
new file mode 100644
index 00000000..ae56129a
--- /dev/null
+++ b/pywb/rewrite/rewrite_hls.py
@@ -0,0 +1,44 @@
+import re
+from io import BytesIO
+from pywb.webagg.utils import StreamIter
+
+
+# ============================================================================
+class RewriteHLS(object):
+    """
+    Rewriter for HLS (m3u8) playlists: of all #EXT-X-STREAM-INF entries,
+    only the one with the highest BANDWIDTH is kept.
+    """
+    EXT_INF = re.compile(r'#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
+
+    def __init__(self, url_rewriter=None):
+        # BaseContentRewriter.create_rewriter() constructs non-html
+        # rewriters with the url rewriter as the only argument, so it
+        # has to be accepted here even though it is not used
+        self.url_rewriter = url_rewriter
+
+    def __call__(self, rwinfo):
+        """
+        Return a StreamIter over the rewritten playlist read from
+        ``rwinfo.content_stream``; the content stream is closed
+        once it has been read.
+        """
+        stream = rwinfo.content_stream
+        try:
+            buff_io = self.rewrite_m3u8(stream)
+        finally:
+            stream.close()
+
+        return StreamIter(buff_io)
+
+    def rewrite_m3u8(self, stream):
+        """
+        Read the whole utf-8 playlist from ``stream`` and return a
+        BytesIO, positioned at the start, containing the playlist with
+        every #EXT-X-STREAM-INF tag, and the line following it, removed,
+        except for the tag with the highest BANDWIDTH (the first such
+        tag on a tie). All other lines are kept as is.
+        """
+        lines = stream.read().decode('utf-8').split('\n')
+
+        stream_inf_indexes = []
+        best_bandwidth = None
+        best_index = None
+
+        for index, line in enumerate(lines):
+            m = self.EXT_INF.match(line)
+            if not m:
+                continue
+
+            stream_inf_indexes.append(index)
+            bandwidth = int(m.group(1))
+            if best_index is None or bandwidth > best_bandwidth:
+                best_bandwidth = bandwidth
+                best_index = index
+
+        # delete back to front so the remaining indexes stay valid
+        for index in reversed(stream_inf_indexes):
+            if index == best_index:
+                continue
+
+            # slice deletion removes the tag and the line following it,
+            # and only the tag if it is the last line of the playlist
+            del lines[index:index + 2]
+
+        return BytesIO('\n'.join(lines).encode('utf-8'))
+
diff --git a/pywb/urlrewrite/rewriter.py b/pywb/urlrewrite/rewriter.py
index e619d95a..9121a9e6 100644
--- a/pywb/urlrewrite/rewriter.py
+++ b/pywb/urlrewrite/rewriter.py
@@ -1,10 +1,4 @@
-from warcio.utils import to_native_str
-from warcio.bufferedreaders import BufferedReader
-
-import webencodings
-import re
-
-from pywb.utils.loaders import load_yaml_config
+from pywb.rewrite.content_rewriter import BaseContentRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
@@ -12,19 +6,17 @@ from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
-from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
-
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
-from pywb.webagg.utils import StreamIter, BUFF_SIZE
+from pywb.rewrite.rewrite_dash import RewriteDASH
+from pywb.rewrite.rewrite_hls import RewriteHLS
+from pywb.rewrite.rewrite_amf import RewriteAMF
# ============================================================================
-class Rewriter(object):
- CHARSET_REGEX = re.compile(b']*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
-
+class DefaultRewriter(BaseContentRewriter):
all_rewriters = {
'header': PrefixHeaderRewriter,
'header-proxy': ProxyHeaderRewriter,
@@ -39,6 +31,12 @@ class Rewriter(object):
'json': JSONPRewriter,
'xml': XMLRewriter,
+
+ 'dash': RewriteDASH,
+
+ 'hls': RewriteHLS,
+
+ 'amf': RewriteAMF,
}
rewrite_types = {
@@ -64,6 +62,9 @@ class Rewriter(object):
# DASH
'application/dash+xml': 'dash',
+ # AMF
+ 'application/x-amf': 'amf',
+
# XML
'text/xml': 'xml',
'application/xml': 'xml',
@@ -73,291 +74,8 @@ class Rewriter(object):
'text/plain': 'plain',
}
- def __init__(self, rules_file, replay_mod=''):
- self.rules = []
- self.load_rules(rules_file)
- self.replay_mod = replay_mod
- #for rw in self.known_rewriters:
- # self.all_rewriters[rw.name] = rw
-
- def add_rewriter(self, rw):
- self.all_rewriters[rw.name] = rw
-
- def get_rewriter(self, url, text_type):
- return self.all_rewriters.get(text_type)
-
- def load_rules(self, filename):
- config = load_yaml_config(filename)
- for rule in config.get('rules'):
- rule = self.parse_rewrite_rule(rule)
- if rule:
- self.rules.append(rule)
-
- def parse_rewrite_rule(self, config):
- rw_config = config.get('rewrite')
- if not rw_config:
- return
-
- rule = rw_config
- url_prefix = config.get('url_prefix')
- if not isinstance(url_prefix, list):
- url_prefix = [url_prefix]
-
- rule['url_prefix'] = url_prefix
-
- regexs = rule.get('js_regexs')
- if regexs:
- parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)
- rule['js_regex_func'] = parse_rules_func
-
- return rule
-
- def get_rule(self, cdx):
- urlkey = to_native_str(cdx['urlkey'])
-
- for rule in self.rules:
- if any((urlkey.startswith(prefix) for prefix in rule['url_prefix'])):
- return rule
-
- return {}
-
- def get_rw_class(self, rule, text_type, rwinfo):
- if text_type == 'js' and not rwinfo.is_url_rw():
- text_type = 'js-proxy'
-
- rw_type = rule.get(text_type, text_type)
- rw_class = self.all_rewriters.get(rw_type)
-
- return rw_type, rw_class
-
- def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
- rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)
-
- if rw_type in ('js', 'js_proxy'):
- extra_rules = []
- if 'js_regex_func' in rule:
- extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)
-
- return rw_class(rwinfo.url_rewriter, extra_rules)
-
- elif rw_type != 'html':
- return rw_class(rwinfo.url_rewriter)
-
- # HTML Rewriter
- head_insert_str = self.get_head_insert(rwinfo, rule, head_insert_func, cdx)
-
- js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
- css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)
-
- rw = rw_class(rwinfo.url_rewriter,
- js_rewriter=js_rewriter,
- css_rewriter=css_rewriter,
- head_insert=head_insert_str,
- url=cdx['url'],
- defmod=self.replay_mod,
- parse_comments=rule.get('parse_comments', False))
-
- return rw
-
- def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
- head_insert_str = ''
- charset = rwinfo.charset
-
- # if no charset set, attempt to extract from first 1024
- if not charset:
- first_buff = rwinfo.read_and_keep(1024)
- charset = self.extract_html_charset(first_buff)
-
- if head_insert_func:
- head_insert_orig = head_insert_func(rule, cdx)
-
- if charset:
- try:
- head_insert_str = webencodings.encode(head_insert_orig, charset)
- except:
- pass
-
- if not head_insert_str:
- charset = 'utf-8'
- head_insert_str = head_insert_orig.encode(charset)
-
- head_insert_str = head_insert_str.decode('iso-8859-1')
-
- return head_insert_str
-
- def extract_html_charset(self, buff):
- charset = None
- m = self.CHARSET_REGEX.search(buff)
- if m:
- charset = m.group(1)
- charset = to_native_str(charset)
-
- return charset
-
- def rewrite_headers(self, rwinfo):
- if rwinfo.is_url_rw():
- header_rw_name = 'header'
- else:
- header_rw_name = 'header-proxy'
-
- header_rw_class = self.all_rewriters.get(header_rw_name)
- rwinfo.rw_http_headers = header_rw_class(rwinfo)()
-
- def __call__(self, record, url_rewriter, cookie_rewriter,
- head_insert_func=None,
- cdx=None):
-
- rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
-
- self.rewrite_headers(rwinfo)
-
- content_rewriter = None
- if rwinfo.is_content_rw():
- rule = self.get_rule(cdx)
- content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
-
- if not content_rewriter:
- return rwinfo.rw_http_headers, StreamIter(rwinfo.content_stream), False
-
- #rwinfo.rw_http_headers.status_headers.remove_header('content-length')
-
- # align to line end for all non-html rewriting
- align = (rwinfo.text_type != 'html')
-
- # Create rewriting generator
- gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
- rewrite_func=content_rewriter.rewrite,
- final_read_func=content_rewriter.close,
- align_to_line=align)
-
- return rwinfo.rw_http_headers, gen, True
-
- @staticmethod
- def rewrite_text_stream_to_gen(stream,
- rewrite_func,
- final_read_func,
- align_to_line):
- """
- Convert stream to generator using applying rewriting func
- to each portion of the stream.
- Align to line boundaries if needed.
- """
- try:
- buff = ''
-
- while True:
- buff = stream.read(BUFF_SIZE)
- if not buff:
- break
-
- if align_to_line:
- buff += stream.readline()
-
- buff = rewrite_func(buff.decode('iso-8859-1'))
- yield buff.encode('iso-8859-1')
-
- # For adding a tail/handling final buffer
- buff = final_read_func()
- if buff:
- yield buff.encode('iso-8859-1')
-
- finally:
- stream.close()
-
-
-# ============================================================================
-class RewriteInfo(object):
- TAG_REGEX = re.compile(b'^\s*\<')
-
- def __init__(self, record, rewriter, url_rewriter, cookie_rewriter):
- self.record = record
-
- self.rw_http_headers = record.http_headers
- self.content_stream = record.content_stream()
-
- self.rewriter = rewriter
-
- self.text_type = None
- self.charset = None
-
- self.url_rewriter = url_rewriter
-
- if not cookie_rewriter:
- cookie_rewriter = ExactPathCookieRewriter(url_rewriter)
-
- self.cookie_rewriter = cookie_rewriter
-
- self._fill_text_type_and_charset()
- self._resolve_text_type()
-
- def _fill_text_type_and_charset(self):
- content_type = self.record.http_headers.get_header('Content-Type')
- if not content_type:
- return
-
- parts = content_type.split(';', 1)
- mime = parts[0]
-
- self.text_type = self.rewriter.rewrite_types.get(mime)
- if not self.text_type:
- return
-
- if len(parts) == 2:
- parts = parts[1].lower().split('charset=', 1)
- if len(parts) == 2:
- self.charset = parts[1].strip()
-
- def _resolve_text_type(self):
- mod = self.url_rewriter.wburl.mod
-
- if self.text_type == 'css' and mod == 'js_':
- self.text_type = 'css'
-
- # only attempt to resolve between html and other text types
- if self.text_type != 'html':
- return
-
- if mod != 'js_' and mod != 'cs_':
- return
-
- buff = self.read_and_keep(128)
-
- # check if starts with a tag, then likely html
- if self.TAG_REGEX.match(buff):
- self.text_type = 'html'
-
- def read_and_keep(self, size):
- buff = self.content_stream.read(size)
- self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
- return buff
-
- def is_content_rw(self):
- if not self.url_rewriter.prefix:
- return False
-
- if self.url_rewriter.wburl.mod == 'id_':
- return False
-
- if self.text_type == 'html':
- if self.url_rewriter.rewrite_opts.get('is_ajax'):
- return False
-
- elif self.text_type == 'plain':
- if self.url_rewriter.wburl.mod not in ('js_', 'cs_'):
- return False
-
- elif not self.text_type:
- return False
-
- return True
-
- def is_url_rw(self):
- if not self.url_rewriter:
- return False
-
- if self.url_rewriter.wburl.mod == 'id_':
- return False
-
- return True
-
+ def init_js_regex(self, regexs):
+     """
+     Return the result of RegexRewriter.parse_rules_from_config()
+     for the given 'js_regexs' rule config.
+     """
+     parse_rules_func = RegexRewriter.parse_rules_from_config(regexs)
+     return parse_rules_func
+ def get_rewrite_types(self):
+     """
+     Return this rewriter's ``rewrite_types`` mapping.
+     """
+     rewrite_types = self.rewrite_types
+     return rewrite_types
diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/urlrewrite/rewriterapp.py
index c041bbcc..a8687c5e 100644
--- a/pywb/urlrewrite/rewriterapp.py
+++ b/pywb/urlrewrite/rewriterapp.py
@@ -3,7 +3,7 @@ import requests
#from pywb.rewrite.rewrite_amf import RewriteAMFMixin
#from pywb.rewrite.rewrite_dash import RewriteDASHMixin
#from pywb.rewrite.rewrite_content import RewriteContent
-from pywb.urlrewrite.rewriter import Rewriter
+from pywb.urlrewrite.rewriter import DefaultRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
@@ -71,7 +71,7 @@ class RewriterApp(object):
#frame_type = 'inverse' if framed_replay else False
#self.content_rewriter = Rewriter(is_framed_replay=frame_type)
- self.content_rw = Rewriter('pkg://pywb/rules.yaml', self.replay_mod)
+ self.content_rw = DefaultRewriter('pkg://pywb/rules.yaml', self.replay_mod)
if not jinja_env:
jinja_env = JinjaEnv(globals={'static_path': 'static'})
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 35988b80..67ec287f 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -152,8 +152,7 @@ class TestWbIntegration(BaseConfigTest):
assert len(lines) == 17
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
-
- def test_replay_banner_only(self):
+ def _test_replay_banner_only(self):
resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
# wb.js header insertion