diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 571cd196..69dbef32 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -5,52 +5,118 @@ from six.moves.urllib.parse import unquote # ================================================================= -class RegexRewriter(StreamingRewriter): - # @staticmethod - # def comment_out(string): - # return '/*' + string + '*/' +class RxRules(object): + HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' @staticmethod - def format(template): - return lambda string: template.format(string) - - @staticmethod - def fixed(string): - return lambda _: string - - @staticmethod - def remove_https(string): + def remove_https(string, _): return string.replace("https", "http") + @staticmethod + def replace_str(replacer): + return lambda x, _: x.replace('this', replacer) + + @staticmethod + def format(template): + return lambda string, _: template.format(string) + + @staticmethod + def fixed(string): + return lambda _, _2: string + + @staticmethod + def archival_rewrite(): + return lambda string, rewriter: rewriter.rewrite(string) + @staticmethod def add_prefix(prefix): - return lambda string: prefix + string + return lambda string, _: prefix + string @staticmethod def add_suffix(suffix): - return lambda string: string + suffix + return lambda string, _: string + suffix @staticmethod - def archival_rewrite(rewriter): - return lambda string: rewriter.rewrite(string) - - - HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - - # DEFAULT_OP = add_prefix - - def __init__(self, rewriter, rules): - super(RegexRewriter, self).__init__(rewriter) - # rules = self.create_rules(http_prefix) - + def compile_rules(rules): # Build regexstr, concatenating regex list regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) # ensure it's not middle of a word, wrap in non-capture group regex_str = '(?' @@ -109,17 +113,34 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html') in headers.headers exp = '' - assert b''.join(gen).decode('utf-8') == exp + + result = b''.join(gen).decode('utf-8') + assert exp == result def test_rewrite_js_mod(self, headers): content = 'function() { location.href = "http://example.com/"; }' - headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=False) assert ('Content-Type', 'text/javascript') in headers.headers exp = 'function() { WB_wombat_location.href = "http://example.com/"; }' - assert b''.join(gen).decode('utf-8') == exp + result = b''.join(gen).decode('utf-8') + + assert exp == result + + def test_rewrite_js_mod_with_obj_proxy(self, headers): + content = 'function() { location.href = "http://example.com/"; }' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=True) + + assert ('Content-Type', 'text/javascript') in headers.headers + + exp = 'function() { location.href = "http://example.com/"; }' + result = b''.join(gen).decode('utf-8') + + assert 'let window ' in result + assert exp in result def test_rewrite_cs_mod(self, headers): content = '.foo { background: url(http://localhost:8080/prefix/201701cs_/http://example.com/) }' @@ -136,7 +157,7 @@ class TestContentRewriter(object): headers = {'Content-Type': 'application/x-javascript'} content = 'function() { location.href = "http://example.com/"; }' - headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', use_js_proxy=False) assert ('Content-Type', 'application/x-javascript') in headers.headers @@ -281,11 +302,14 @@ class TestContentRewriter(object): content = '/**/ jQuery_ABC({"foo": "bar"});' headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', - url='http://example.com/path/file') + url='http://example.com/path/file', + use_js_proxy=True) assert ('Content-Type', 'text/javascript') in headers.headers - assert b''.join(gen).decode('utf-8') == content + result = b''.join(gen).decode('utf-8') + assert 'let window' in result + assert content in result def test_rewrite_text_no_type(self): headers = {} @@ -307,7 +331,9 @@ class TestContentRewriter(object): assert headers.headers == [('Content-Type', 'text/javascript')] - assert b''.join(gen).decode('utf-8') == content + result = b''.join(gen).decode('utf-8') + assert 'let window ' in result + assert content in result def test_custom_fuzzy_replace(self): headers = {'Content-Type': 'application/octet-stream'} @@ -329,7 +355,7 @@ class TestContentRewriter(object): content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"]' # is_live - rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', + rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', url='https://player.vimeo.com/video/123445/config/config?A=B', is_live='1') @@ -342,6 +368,21 @@ class TestContentRewriter(object): assert b''.join(gen).decode('utf-8') == content + def test_custom_live_js_obj_proxy(self): + headers = {'Content-Type': 'text/javascript'} + content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"], "hls": {"A": "B"}' + + # is_live + rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', + url='https://player.vimeo.com/video/123445/config/config?A=B', + is_live='1', + use_js_proxy=True) + + # rewritten + rw_content = '{"foo":"bar", "__dash": {"on": "true"}, "some": ["list"], "__hls": {"A": "B"}' + + assert rw_content in b''.join(gen).decode('utf-8') + def test_custom_ajax_rewrite(self): headers = {'Content-Type': 'application/json', 'X-Pywb-Requested-With': 'XMLHttpRequest'} @@ -451,6 +492,26 @@ http://example.com/video_4.m3u8 """ assert b''.join(gen).decode('utf-8') == filtered + + def test_dash_fb_in_js(self): + headers = {'Content-Type': 'text/javascript'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: + content = 'dash_manifest:"' + fh.read().encode('unicode-escape').decode('utf-8') + + rep_ids = r'\n",dash_prefetched_representation_ids:["4","5"]' + content += rep_ids + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', + url='http://facebook.com/example/dash/manifest.mpd') + + assert headers.headers == [('Content-Type', 'text/javascript')] + + result = b''.join(gen).decode('utf-8') + + # 4, 5 representations removed, replaced with default 1, 7 + assert 'dash_prefetched_representation_ids:["1", "7"]' in result + assert rep_ids not in result + def test_dash_custom_max_resolution(self): headers = {'Content-Type': 'application/dash+xml'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: @@ -533,5 +594,3 @@ http://example.com/video_4.m3u8 assert b''.join(gen).decode('utf-8') == filtered - - diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 46608257..7e0070d6 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -3,7 +3,7 @@ r""" # Custom Regex #================================================================= # Test https->http converter (other tests below in subclasses) ->>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') +>>> RegexRewriter(urlrewriter, [(RxRules.HTTPX_MATCH_STR, RxRules.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' @@ -101,7 +101,7 @@ r""" '"/web/20131010/\\\\/\\\\/example.com/"' # custom rules added ->>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) +>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RxRules.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic @@ -274,7 +274,7 @@ r""" #================================================================= from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter +from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter, RxRules from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter