1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

RegexRewriter Optimization (#354)

* bump version to 2.0.5

* regexrewriter: split rules into a separate class hierarchy from the rewriter.
rules logic and regexes can be initialized once, while a rewriter is created per response being rewritten

* regexrewriter: refactor remaining rewriters to use a shared rules factory to avoid re-initializing rules

* fix spacing

* fixes: ensure custom rules are added first, fix rewrite_fb_dash
content_rewriter tests: update tests to check with location-only and js obj proxy rewriters, check fb dash rewriter

* simplify JSNoneRewriter
This commit is contained in:
Ilya Kreymer 2018-08-05 16:40:19 -07:00 committed by GitHub
parent 2f062cf5c7
commit 973a2dcff9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 250 additions and 176 deletions

View File

@ -5,52 +5,118 @@ from six.moves.urllib.parse import unquote
# ================================================================= # =================================================================
class RegexRewriter(StreamingRewriter): class RxRules(object):
# @staticmethod HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
# def comment_out(string):
# return '/*' + string + '*/'
@staticmethod @staticmethod
def format(template): def remove_https(string, _):
return lambda string: template.format(string)
@staticmethod
def fixed(string):
return lambda _: string
@staticmethod
def remove_https(string):
return string.replace("https", "http") return string.replace("https", "http")
@staticmethod
def replace_str(replacer):
return lambda x, _: x.replace('this', replacer)
@staticmethod
def format(template):
return lambda string, _: template.format(string)
@staticmethod
def fixed(string):
return lambda _, _2: string
@staticmethod
def archival_rewrite():
return lambda string, rewriter: rewriter.rewrite(string)
@staticmethod @staticmethod
def add_prefix(prefix): def add_prefix(prefix):
return lambda string: prefix + string return lambda string, _: prefix + string
@staticmethod @staticmethod
def add_suffix(suffix): def add_suffix(suffix):
return lambda string: string + suffix return lambda string, _: string + suffix
@staticmethod @staticmethod
def archival_rewrite(rewriter): def compile_rules(rules):
return lambda string: rewriter.rewrite(string)
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
# DEFAULT_OP = add_prefix
def __init__(self, rewriter, rules):
super(RegexRewriter, self).__init__(rewriter)
# rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list # Build regexstr, concatenating regex list
regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules])
# ensure it's not middle of a word, wrap in non-capture group # ensure it's not middle of a word, wrap in non-capture group
regex_str = '(?<!\w)(?:' + regex_str + ')' regex_str = '(?<!\w)(?:' + regex_str + ')'
self.regex = re.compile(regex_str, re.M) return re.compile(regex_str, re.M)
self.rules = rules
def __init__(self, rules=None):
self.rules = rules or []
self.regex = self.compile_rules(self.rules)
def __call__(self, extra_rules=None):
if not extra_rules:
return self.rules, self.regex
all_rules = extra_rules + self.rules
regex = self.compile_rules(all_rules)
return all_rules, regex
# =================================================================
class JSWombatProxyRules(RxRules):
def __init__(self):
local_init_func = '\nvar {0} = function(name) {{\
return (self._wb_wombat && self._wb_wombat.local_init &&\
self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n'
local_init_func_name = '_____WB$wombat$assign$function_____'
local_var_line = 'let {0} = {1}("{0}");'
this_rw = '(this && this._WB_wombat_obj_proxy || this)'
check_loc = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = '
self.local_objs = [
'window',
'self',
'document',
'location',
'top',
'parent',
'frames',
'opener']
local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs])
prop_str = '|'.join(self.local_objs)
rules = [
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
(r'(?<!\.)\blocation\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(this_rw), 0),
]
super(JSWombatProxyRules, self).__init__(rules)
self.first_buff = local_init_func.format(local_init_func_name) + local_declares
self.last_buff = '\n\n}'
# =================================================================
class RegexRewriter(StreamingRewriter):
rules_factory = RxRules()
def __init__(self, rewriter, extra_rules=None, first_buff=''):
super(RegexRewriter, self).__init__(rewriter, first_buff=first_buff)
# rules = self.create_rules(http_prefix)
self.rules, self.regex = self.rules_factory(extra_rules)
def filter(self, m): def filter(self, m):
return True return True
@ -79,7 +145,7 @@ class RegexRewriter(StreamingRewriter):
# if not hasattr(op, '__call__'): # if not hasattr(op, '__call__'):
# op = RegexRewriter.DEFAULT_OP(op) # op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i)) result = op(m.group(i), self.url_rewriter)
final_str = result final_str = result
# if extracting partial match # if extracting partial match
@ -96,11 +162,11 @@ class RegexRewriter(StreamingRewriter):
def parse_rule(obj): def parse_rule(obj):
match = obj.get('match') match = obj.get('match')
if 'rewrite' in obj: if 'rewrite' in obj:
replace = RegexRewriter.archival_rewrite(rewriter) replace = RxRules.archival_rewrite()
elif 'function' in obj: elif 'function' in obj:
replace = load_py_name(obj['function']) replace = load_py_name(obj['function'])
else: else:
replace = RegexRewriter.format(obj.get('replace', '{0}')) replace = RxRules.format(obj.get('replace', '{0}'))
group = obj.get('group', 0) group = obj.get('group', 0)
result = (match, replace, group) result = (match, replace, group)
return result return result
@ -111,9 +177,32 @@ class RegexRewriter(StreamingRewriter):
# ================================================================= # =================================================================
class JSLinkRewriterMixin(object): class JSLocationRewriterRules(RxRules):
""" """
JS Rewriter which rewrites absolute http://, https:// and // urls JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: ``WB_wombat_``)
"""
def __init__(self, prefix='WB_wombat_'):
super(JSLocationRewriterRules, self).__init__(self.get_rules(prefix))
def get_rules(self, prefix):
rules = [
(r'(?<![$\'"])\b(?:location|top)\b(?![$\'":])', self.add_prefix(prefix), 0),
(r'(?<=[?])\s*(?:\w+[.])?(location)\s*(?=[:])', self.add_prefix(prefix), 1),
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self.window).'), 0),
(r'(?<=\.)frameElement\b', self.add_prefix(prefix), 0),
]
return rules
# =================================================================
class JSLinkAndLocationRewriterRules(JSLocationRewriterRules):
"""
JS Rewriter rules which also rewrite absolute http://, https:// and // urls
at the beginning of a string at the beginning of a string
""" """
# JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])' # JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])'
@ -122,94 +211,44 @@ class JSLinkRewriterMixin(object):
# JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-][^"\s\';&\\]*(?=["\';&\\])' # JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-][^"\s\';&\\]*(?=["\';&\\])'
JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@%.\\-]+/' JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@%.\\-]+/'
def __init__(self, rewriter, rules=[]): def get_rules(self, prefix):
rules = rules + [ rules = super(JSLinkAndLocationRewriterRules, self).get_rules(prefix)
(self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0) rules.append((self.JS_HTTPX, RxRules.archival_rewrite(), 0))
] return rules
super(JSLinkRewriterMixin, self).__init__(rewriter, rules)
# ================================================================= # =================================================================
class JSLocationRewriterMixin(object): class JSLocationOnlyRewriter(RegexRewriter):
""" rules_factory = JSLocationRewriterRules()
JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: ``WB_wombat_``)
"""
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [
(r'(?<![$\'"])\b(?:location|top)\b(?![$\'":])', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=[?])\s*(?:\w+[.])?(location)\s*(?=[:])', RegexRewriter.add_prefix(prefix), 1),
(r'(?<=\.)postMessage\b\(', RegexRewriter.add_prefix('__WB_pmw(self.window).'), 0),
(r'(?<=\.)frameElement\b', RegexRewriter.add_prefix(prefix), 0),
]
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
# ================================================================= # =================================================================
class JSWombatProxyRewriterMixin(object): class JSLinkAndLocationRewriter(RegexRewriter):
rules_factory = JSLinkAndLocationRewriterRules()
JSRewriter = JSLinkAndLocationRewriter
# =================================================================
class JSWombatProxyRewriter(RegexRewriter):
""" """
JS Rewriter mixin which wraps the contents of the JS Rewriter mixin which wraps the contents of the
script in an anonymous block scope and inserts script in an anonymous block scope and inserts
Wombat js-proxy setup Wombat js-proxy setup
""" """
local_init_func = '\nvar {0} = function(name) {{\ rules_factory = JSWombatProxyRules()
return (self._wb_wombat && self._wb_wombat.local_init &&\
self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n'
local_init_func_name = '_____WB$wombat$assign$function_____' def __init__(self, rewriter, extra_rules=None):
super(JSWombatProxyRewriter, self).__init__(rewriter, extra_rules=extra_rules)
local_var_line = 'let {0} = {1}("{0}");' self.first_buff = self.rules_factory.first_buff
self.last_buff = self.rules_factory.last_buff
local_objs = ['window', self.local_objs = self.rules_factory.local_objs
'self',
'document',
'location',
'top',
'parent',
'frames',
'opener']
THIS_RW = '(this && this._WB_wombat_obj_proxy || this)'
CHECK_LOC = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = '
@classmethod
def replace_str(cls, replacer):
return lambda x: x.replace('this', replacer)
def __init__(self, rewriter, rules=[]):
#func_rw = 'Function("return {0}")'.format(self.THIS_RW)
prop_str = '|'.join(self.local_objs)
rules = rules + [
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
(r'(?<!\.)\blocation\b\s*[=]\s*(?![=])', self.add_suffix(self.CHECK_LOC), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(self.THIS_RW), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + self.THIS_RW), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(self.THIS_RW), 0),
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(self.THIS_RW), 0),
('\}(?:\s*\))?\s*\(this\)', self.replace_str(self.THIS_RW), 0),
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(self.THIS_RW), 0),
]
super(JSWombatProxyRewriterMixin, self).__init__(rewriter, rules)
local_declares = '\n'.join([self.local_var_line.format(obj, self.local_init_func_name) for obj in self.local_objs])
self.first_buff = self.local_init_func.format(self.local_init_func_name) + local_declares
self.last_buff = '\n\n}'
def rewrite_complete(self, string, **kwargs): def rewrite_complete(self, string, **kwargs):
if not kwargs.get('inline_attr'): if not kwargs.get('inline_attr'):
return super(JSWombatProxyRewriterMixin, self).rewrite_complete(string) return super(JSWombatProxyRewriter, self).rewrite_complete(string)
# check if any of the wrapped objects are used in the script # check if any of the wrapped objects are used in the script
# if not, don't rewrite # if not, don't rewrite
@ -231,31 +270,8 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
return self.last_buff return self.last_buff
# =================================================================
class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
pass
# =================================================================
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
pass
# =================================================================
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
JSLinkRewriterMixin,
RegexRewriter):
pass
# ================================================================= # =================================================================
class JSNoneRewriter(RegexRewriter): class JSNoneRewriter(RegexRewriter):
def __init__(self, rewriter, rules=[]):
super(JSNoneRewriter, self).__init__(rewriter, rules)
# =================================================================
class JSWombatProxyRewriter(JSWombatProxyRewriterMixin, RegexRewriter):
pass pass
@ -287,16 +303,40 @@ class JSReplaceFuzzy(object):
# ================================================================= # =================================================================
# Set 'default' JSRewriter class CSSRules(RxRules):
JSRewriter = JSLinkAndLocationRewriter CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*([^)'\"]+)\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
def __init__(self):
rules = [
(self.CSS_URL_REGEX, self.archival_rewrite(), 1),
(self.CSS_IMPORT_NO_URL_REGEX, self.archival_rewrite(), 1),
]
super(CSSRules, self).__init__(rules)
# =================================================================
class CSSRewriter(RegexRewriter):
rules_factory = CSSRules()
# =================================================================
class XMLRules(RxRules):
def __init__(self):
rules = [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
self.HTTPX_MATCH_STR + ')',
self.archival_rewrite(), 2),
]
super(XMLRules, self).__init__(rules)
# ================================================================= # =================================================================
class XMLRewriter(RegexRewriter): class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]): rules_factory = XMLRules()
rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rewriter, rules)
# custom filter to reject 'xmlns' attr # custom filter to reject 'xmlns' attr
def filter(self, m): def filter(self, m):
@ -306,30 +346,5 @@ class XMLRewriter(RegexRewriter):
return True return True
def _create_rules(self, rewriter):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')',
RegexRewriter.archival_rewrite(rewriter), 2),
]
# =================================================================
class CSSRewriter(RegexRewriter):
CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*([^)'\"]+)\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
super(CSSRewriter, self).__init__(rewriter, rules)
def _create_rules(self, rewriter):
return [
(CSSRewriter.CSS_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX,
RegexRewriter.archival_rewrite(rewriter), 1),
]

View File

@ -58,7 +58,7 @@ class RewriteDASH(BufferedRewriter):
# ============================================================================ # ============================================================================
def rewrite_fb_dash(string): def rewrite_fb_dash(string, *args):
DASH_SPLIT = r'\n",dash_prefetched_representation_ids:' DASH_SPLIT = r'\n",dash_prefetched_representation_ids:'
inx = string.find(DASH_SPLIT) inx = string.find(DASH_SPLIT)
if inx < 0: if inx < 0:

View File

@ -10,7 +10,7 @@ from pywb.utils.io import chunk_encode_iter
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb import get_test_dir from pywb import get_test_dir
@ -35,6 +35,7 @@ class TestContentRewriter(object):
@classmethod @classmethod
def setup_class(self): def setup_class(self):
self.content_rewriter = DefaultRewriter() self.content_rewriter = DefaultRewriter()
self.js_proxy_content_rewriter = RewriterWithJSProxy()
def _create_response_record(self, url, headers, payload, warc_headers): def _create_response_record(self, url, headers, payload, warc_headers):
writer = BufferWARCWriter() writer = BufferWARCWriter()
@ -53,7 +54,7 @@ class TestContentRewriter(object):
def rewrite_record(self, headers, content, ts, url='http://example.com/', def rewrite_record(self, headers, content, ts, url='http://example.com/',
prefix='http://localhost:8080/prefix/', warc_headers=None, prefix='http://localhost:8080/prefix/', warc_headers=None,
request_url=None, is_live=None): request_url=None, is_live=None, use_js_proxy=True):
record = self._create_response_record(url, headers, content, warc_headers) record = self._create_response_record(url, headers, content, warc_headers)
@ -68,7 +69,10 @@ class TestContentRewriter(object):
cdx['is_fuzzy'] = '1' cdx['is_fuzzy'] = '1'
cdx['is_live'] = is_live cdx['is_live'] = is_live
return self.content_rewriter(record, url_rewriter, None, cdx=cdx) if use_js_proxy:
return self.js_proxy_content_rewriter(record, url_rewriter, None, cdx=cdx)
else:
return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
def test_rewrite_html(self, headers): def test_rewrite_html(self, headers):
content = '<html><body><a href="http://example.com/"></a></body></html>' content = '<html><body><a href="http://example.com/"></a></body></html>'
@ -109,17 +113,34 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html') in headers.headers assert ('Content-Type', 'text/html') in headers.headers
exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://example.com/"></a></body></html>' exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://example.com/"></a></body></html>'
assert b''.join(gen).decode('utf-8') == exp
result = b''.join(gen).decode('utf-8')
assert exp == result
def test_rewrite_js_mod(self, headers): def test_rewrite_js_mod(self, headers):
content = 'function() { location.href = "http://example.com/"; }' content = 'function() { location.href = "http://example.com/"; }'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=False)
assert ('Content-Type', 'text/javascript') in headers.headers assert ('Content-Type', 'text/javascript') in headers.headers
exp = 'function() { WB_wombat_location.href = "http://example.com/"; }' exp = 'function() { WB_wombat_location.href = "http://example.com/"; }'
assert b''.join(gen).decode('utf-8') == exp result = b''.join(gen).decode('utf-8')
assert exp == result
def test_rewrite_js_mod_with_obj_proxy(self, headers):
content = 'function() { location.href = "http://example.com/"; }'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=True)
assert ('Content-Type', 'text/javascript') in headers.headers
exp = 'function() { location.href = "http://example.com/"; }'
result = b''.join(gen).decode('utf-8')
assert 'let window ' in result
assert exp in result
def test_rewrite_cs_mod(self, headers): def test_rewrite_cs_mod(self, headers):
content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }' content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }'
@ -136,7 +157,7 @@ class TestContentRewriter(object):
headers = {'Content-Type': 'application/x-javascript'} headers = {'Content-Type': 'application/x-javascript'}
content = 'function() { location.href = "http://example.com/"; }' content = 'function() { location.href = "http://example.com/"; }'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=False)
assert ('Content-Type', 'application/x-javascript') in headers.headers assert ('Content-Type', 'application/x-javascript') in headers.headers
@ -281,11 +302,14 @@ class TestContentRewriter(object):
content = '/**/ jQuery_ABC({"foo": "bar"});' content = '/**/ jQuery_ABC({"foo": "bar"});'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
url='http://example.com/path/file') url='http://example.com/path/file',
use_js_proxy=True)
assert ('Content-Type', 'text/javascript') in headers.headers assert ('Content-Type', 'text/javascript') in headers.headers
assert b''.join(gen).decode('utf-8') == content result = b''.join(gen).decode('utf-8')
assert 'let window' in result
assert content in result
def test_rewrite_text_no_type(self): def test_rewrite_text_no_type(self):
headers = {} headers = {}
@ -307,7 +331,9 @@ class TestContentRewriter(object):
assert headers.headers == [('Content-Type', 'text/javascript')] assert headers.headers == [('Content-Type', 'text/javascript')]
assert b''.join(gen).decode('utf-8') == content result = b''.join(gen).decode('utf-8')
assert 'let window ' in result
assert content in result
def test_custom_fuzzy_replace(self): def test_custom_fuzzy_replace(self):
headers = {'Content-Type': 'application/octet-stream'} headers = {'Content-Type': 'application/octet-stream'}
@ -329,7 +355,7 @@ class TestContentRewriter(object):
content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"]' content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"]'
# is_live # is_live
rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
url='https://player.vimeo.com/video/123445/config/config?A=B', url='https://player.vimeo.com/video/123445/config/config?A=B',
is_live='1') is_live='1')
@ -342,6 +368,21 @@ class TestContentRewriter(object):
assert b''.join(gen).decode('utf-8') == content assert b''.join(gen).decode('utf-8') == content
def test_custom_live_js_obj_proxy(self):
headers = {'Content-Type': 'text/javascript'}
content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"], "hls": {"A": "B"}'
# is_live
rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
url='https://player.vimeo.com/video/123445/config/config?A=B',
is_live='1',
use_js_proxy=True)
# rewritten
rw_content = '{"foo":"bar", "__dash": {"on": "true"}, "some": ["list"], "__hls": {"A": "B"}'
assert rw_content in b''.join(gen).decode('utf-8')
def test_custom_ajax_rewrite(self): def test_custom_ajax_rewrite(self):
headers = {'Content-Type': 'application/json', headers = {'Content-Type': 'application/json',
'X-Pywb-Requested-With': 'XMLHttpRequest'} 'X-Pywb-Requested-With': 'XMLHttpRequest'}
@ -451,6 +492,26 @@ http://example.com/video_4.m3u8
</MPD>""" </MPD>"""
assert b''.join(gen).decode('utf-8') == filtered assert b''.join(gen).decode('utf-8') == filtered
def test_dash_fb_in_js(self):
headers = {'Content-Type': 'text/javascript'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
content = 'dash_manifest:"' + fh.read().encode('unicode-escape').decode('utf-8')
rep_ids = r'\n",dash_prefetched_representation_ids:["4","5"]'
content += rep_ids
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_',
url='http://facebook.com/example/dash/manifest.mpd')
assert headers.headers == [('Content-Type', 'text/javascript')]
result = b''.join(gen).decode('utf-8')
# 4, 5 representations removed, replaced with default 1, 7
assert 'dash_prefetched_representation_ids:["1", "7"]' in result
assert rep_ids not in result
def test_dash_custom_max_resolution(self): def test_dash_custom_max_resolution(self):
headers = {'Content-Type': 'application/dash+xml'} headers = {'Content-Type': 'application/dash+xml'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
@ -533,5 +594,3 @@ http://example.com/video_4.m3u8
assert b''.join(gen).decode('utf-8') == filtered assert b''.join(gen).decode('utf-8') == filtered

View File

@ -3,7 +3,7 @@ r"""
# Custom Regex # Custom Regex
#================================================================= #=================================================================
# Test https->http converter (other tests below in subclasses) # Test https->http converter (other tests below in subclasses)
>>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') >>> RegexRewriter(urlrewriter, [(RxRules.HTTPX_MATCH_STR, RxRules.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
@ -101,7 +101,7 @@ r"""
'"/web/20131010/\\\\/\\\\/example.com/"' '"/web/20131010/\\\\/\\\\/example.com/"'
# custom rules added # custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RxRules.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */' 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic # scheme-agnostic
@ -274,7 +274,7 @@ r"""
#================================================================= #=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter, RxRules
from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter