mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
embed rewriting: add 'em_' flag for all regex-based rewrites
(js, css, xml) to be able to distinguish between embeds and non-embeds more conclusively.
wbrequest: add is_embed and is_identity properties.
Update tests.
Don't insert the HTML banner if the page is detected as an embed.
This commit is contained in:
parent
52d99aef57
commit
10c84d8354
@ -87,6 +87,17 @@ class WbRequest(object):
|
||||
|
||||
self._parse_extra()
|
||||
|
||||
@property
|
||||
def is_embed(self):
|
||||
return (self.wb_url and
|
||||
self.wb_url.mod and
|
||||
self.wb_url.mod != 'id_')
|
||||
|
||||
@property
|
||||
def is_identity(self):
|
||||
return (self.wb_url and
|
||||
self.wb_url.mod == 'id_')
|
||||
|
||||
def _is_ajax(self):
|
||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||
if not value:
|
||||
|
@ -25,7 +25,7 @@ class RegexRewriter(object):
|
||||
|
||||
@staticmethod
|
||||
def archival_rewrite(rewriter):
|
||||
return lambda string: rewriter.rewrite(string)
|
||||
return lambda string: rewriter.rewrite(string, 'em_')
|
||||
|
||||
#@staticmethod
|
||||
#def replacer(other):
|
||||
@ -33,7 +33,7 @@ class RegexRewriter(object):
|
||||
|
||||
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
|
||||
|
||||
DEFAULT_OP = add_prefix
|
||||
#DEFAULT_OP = add_prefix
|
||||
|
||||
def __init__(self, rules):
|
||||
#rules = self.create_rules(http_prefix)
|
||||
@ -74,8 +74,8 @@ class RegexRewriter(object):
|
||||
return m.group(0)
|
||||
|
||||
# Custom func
|
||||
if not hasattr(op, '__call__'):
|
||||
op = RegexRewriter.DEFAULT_OP(op)
|
||||
#if not hasattr(op, '__call__'):
|
||||
# op = RegexRewriter.DEFAULT_OP(op)
|
||||
|
||||
result = op(m.group(i))
|
||||
final_str = result
|
||||
@ -124,8 +124,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
||||
|
||||
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
||||
rules = rules + [
|
||||
(r'(?<!/)\blocation\b', prefix, 0),
|
||||
(r'(?<=document\.)domain', prefix, 0),
|
||||
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||
]
|
||||
#import sys
|
||||
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
|
||||
@ -140,7 +140,7 @@ JSRewriter = JSLinkAndLocationRewriter
|
||||
#=================================================================
|
||||
class XMLRewriter(RegexRewriter):
|
||||
def __init__(self, rewriter, extra=[]):
|
||||
rules = self._create_rules(rewriter.get_abs_url())
|
||||
rules = self._create_rules(rewriter)
|
||||
|
||||
super(XMLRewriter, self).__init__(rules)
|
||||
|
||||
@ -152,10 +152,11 @@ class XMLRewriter(RegexRewriter):
|
||||
|
||||
return True
|
||||
|
||||
def _create_rules(self, http_prefix):
|
||||
def _create_rules(self, rewriter):
|
||||
return [
|
||||
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
|
||||
RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
|
||||
RegexRewriter.HTTPX_MATCH_STR + ')',
|
||||
RegexRewriter.archival_rewrite(rewriter), 2),
|
||||
]
|
||||
|
||||
|
||||
|
@ -51,24 +51,24 @@ ur"""
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
|
||||
|
||||
# Unterminated script tag, handle but don't auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
|
||||
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
|
||||
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
|
@ -49,24 +49,24 @@ ur"""
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html>
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script></head></html>
|
||||
|
||||
# Unterminated script tag, will auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html>
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script></head></html>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
|
||||
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
|
||||
<html><body><div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html>
|
||||
<html><head><style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style></head></html>
|
||||
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html>
|
||||
<html><head><style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style></head></html>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
|
@ -12,16 +12,16 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> _test_js('location = "http://example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r"location = 'http://example.com/abc.html/'")
|
||||
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
|
||||
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
|
||||
|
||||
>>> _test_js(r'location = http://example.com/abc.html/')
|
||||
'WB_wombat_location = http://example.com/abc.html/'
|
||||
@ -37,21 +37,21 @@ r"""
|
||||
'"/location" == some_location_val; locations = WB_wombat_location;'
|
||||
|
||||
>>> _test_js('cool_Location = "http://example.com/abc.html"')
|
||||
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
|
||||
'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
|
||||
|
||||
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||
|
||||
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
|
||||
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||
|
||||
# custom rules added
|
||||
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
|
||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
||||
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
|
||||
|
||||
# scheme-agnostic
|
||||
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
||||
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
|
||||
'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -59,62 +59,62 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
|
||||
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
|
||||
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
|
||||
|
||||
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
|
||||
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
|
||||
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
|
||||
|
||||
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
|
||||
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
|
||||
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
|
||||
|
||||
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
|
||||
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
|
||||
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
|
||||
|
||||
#=================================================================
|
||||
# CSS Rewriting
|
||||
#=================================================================
|
||||
|
||||
>>> _test_css("background: url('/some/path.html')")
|
||||
"background: url('/web/20131010im_/http://example.com/some/path.html')"
|
||||
"background: url('/web/20131010em_/http://example.com/some/path.html')"
|
||||
|
||||
>>> _test_css("background: url('../path.html')")
|
||||
"background: url('/web/20131010im_/http://example.com/path.html')"
|
||||
"background: url('/web/20131010em_/http://example.com/path.html')"
|
||||
|
||||
>>> _test_css("background: url(\"http://domain.com/path.html\")")
|
||||
'background: url("/web/20131010im_/http://domain.com/path.html")'
|
||||
'background: url("/web/20131010em_/http://domain.com/path.html")'
|
||||
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
|
||||
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
|
||||
|
||||
>>> _test_css("background: url('')")
|
||||
"background: url('')"
|
||||
|
||||
>>> _test_css("background: url (\"weirdpath\')")
|
||||
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
|
||||
'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
|
||||
|
||||
>>> _test_css("@import url ('path.css')")
|
||||
"@import url ('/web/20131010im_/http://example.com/path.css')"
|
||||
"@import url ('/web/20131010em_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import url('path.css')")
|
||||
"@import url('/web/20131010im_/http://example.com/path.css')"
|
||||
"@import url('/web/20131010em_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import ( 'path.css')")
|
||||
"@import ( '/web/20131010im_/http://example.com/path.css')"
|
||||
"@import ( '/web/20131010em_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import \"path.css\"")
|
||||
'@import "/web/20131010im_/http://example.com/path.css"'
|
||||
'@import "/web/20131010em_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../path.css\"")
|
||||
'@import (\'/web/20131010im_/http://example.com/path.css"'
|
||||
'@import (\'/web/20131010em_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../url.css\"")
|
||||
'@import (\'/web/20131010im_/http://example.com/url.css"'
|
||||
'@import (\'/web/20131010em_/http://example.com/url.css"'
|
||||
|
||||
>>> _test_css("@import (\"url.css\")")
|
||||
'@import ("/web/20131010im_/http://example.com/url.css")'
|
||||
'@import ("/web/20131010em_/http://example.com/url.css")'
|
||||
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
|
||||
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
|
||||
|
||||
#=================================================================
|
||||
HTTP Headers Rewriting
|
||||
@ -134,7 +134,7 @@ HTTP Headers Rewriting
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Location', '/web/20131010im_/http://example.com/other.html')]),
|
||||
('Location', '/web/20131010/http://example.com/other.html')]),
|
||||
'text_type': None}
|
||||
|
||||
# gzip
|
||||
@ -172,7 +172,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
|
||||
urlrewriter = UrlRewriter('20131010im_/http://example.com/', '/web/')
|
||||
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
|
||||
|
||||
|
||||
def _test_js(string, extra = []):
|
||||
|
@ -22,6 +22,10 @@ function init_banner() {
|
||||
|
||||
var banner = document.getElementById(BANNER_ID);
|
||||
|
||||
if (wbinfo.is_embed) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!banner) {
|
||||
banner = document.createElement("wb_div");
|
||||
banner.setAttribute("id", BANNER_ID);
|
||||
|
@ -11,7 +11,9 @@
|
||||
<script>
|
||||
wbinfo = {}
|
||||
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
|
||||
wbinfo.is_embed = {{"true" if wbrequest.is_embed else "false"}};
|
||||
</script>
|
||||
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
|
||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
|
||||
<!-- End WB Insert -->
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user