1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

embed rewriting: add 'em_' flag for all regex-based rewrites

(js, css, xml) to be able to distinguish between embeds and non-embeds
more conclusively
wbrequest: add is_embed and is_identity properties
update tests
don't insert html banner if detected as an embed
This commit is contained in:
Ilya Kreymer 2014-03-17 19:36:25 -07:00
parent 52d99aef57
commit 10c84d8354
8 changed files with 67 additions and 49 deletions

View File

@ -87,6 +87,17 @@ class WbRequest(object):
self._parse_extra()
@property
def is_embed(self):
    """Report whether this request is for an embedded resource.

    A request counts as an embed when it has a ``wb_url`` whose ``mod``
    value is non-empty and differs from ``'id_'`` (the value that
    ``is_identity`` tests for).

    Returns:
        bool: ``True`` for an embed; ``False`` otherwise, including when
        ``wb_url`` is unset or its ``mod`` is empty.
    """
    wb_url = self.wb_url
    if not wb_url:
        return False

    # Coerce to a real boolean: a bare ``and`` chain would leak the raw
    # ``mod`` string (or None) to callers of an ``is_*`` predicate.
    return bool(wb_url.mod) and wb_url.mod != 'id_'
@property
def is_identity(self):
    """Report whether this request carries the ``'id_'`` modifier.

    Returns:
        bool: ``True`` when ``wb_url`` is set and its ``mod`` equals
        ``'id_'``; ``False`` otherwise, including when ``wb_url`` is unset.
    """
    wb_url = self.wb_url
    if not wb_url:
        # Return a real False rather than leaking the falsy wb_url value
        # (e.g. None) to callers.
        return False

    return bool(wb_url.mod == 'id_')
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value:

View File

@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod
def archival_rewrite(rewriter):
return lambda string: rewriter.rewrite(string)
return lambda string: rewriter.rewrite(string, 'em_')
#@staticmethod
#def replacer(other):
@ -33,7 +33,7 @@ class RegexRewriter(object):
HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = add_prefix
#DEFAULT_OP = add_prefix
def __init__(self, rules):
#rules = self.create_rules(http_prefix)
@ -74,8 +74,8 @@ class RegexRewriter(object):
return m.group(0)
# Custom func
if not hasattr(op, '__call__'):
op = RegexRewriter.DEFAULT_OP(op)
#if not hasattr(op, '__call__'):
# op = RegexRewriter.DEFAULT_OP(op)
result = op(m.group(i))
final_str = result
@ -124,8 +124,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [
(r'(?<!/)\blocation\b', prefix, 0),
(r'(?<=document\.)domain', prefix, 0),
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
]
#import sys
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
@ -140,7 +140,7 @@ JSRewriter = JSLinkAndLocationRewriter
#=================================================================
class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter.get_abs_url())
rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rules)
@ -152,10 +152,11 @@ class XMLRewriter(RegexRewriter):
return True
def _create_rules(self, http_prefix):
def _create_rules(self, rewriter):
return [
('([A-Za-z:]+[\s=]+)?["\'\s]*(' +
RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
RegexRewriter.HTTPX_MATCH_STR + ')',
RegexRewriter.archival_rewrite(rewriter), 2),
]

View File

@ -51,24 +51,24 @@ ur"""
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
# Unterminated script tag, handle but don't auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag, handle but don't auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -49,24 +49,24 @@ ur"""
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html>
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script></head></html>
# Unterminated script tag, will auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html>
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script></head></html>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
<html><body><div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html>
<html><head><style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style></head></html>
# Unterminated style tag, handle but don't auto-terminate
>>> parse('<style>@import url(styles.css)')
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html>
<html><head><style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style></head></html>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -12,16 +12,16 @@ r"""
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\/\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010im_/http:\\\\/\\\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010im_/http://example.com/abc.html/'"
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
@ -37,21 +37,21 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010im_///example.com/abc.html" //comment'
'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
#=================================================================
@ -59,62 +59,62 @@ r"""
#=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010im_/http://example.com/some/path.html')"
"background: url('/web/20131010em_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010im_/http://example.com/path.html')"
"background: url('/web/20131010em_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010im_/http://domain.com/path.html")'
'background: url("/web/20131010em_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010im_/http://example.com/path.css')"
"@import url ('/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
"@import url('/web/20131010im_/http://example.com/path.css')"
"@import url('/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010im_/http://example.com/path.css')"
"@import ( '/web/20131010em_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
'@import "/web/20131010im_/http://example.com/path.css"'
'@import "/web/20131010em_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010im_/http://example.com/path.css"'
'@import (\'/web/20131010em_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010im_/http://example.com/url.css"'
'@import (\'/web/20131010em_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010im_/http://example.com/url.css")'
'@import ("/web/20131010em_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
#=================================================================
HTTP Headers Rewriting
@ -134,7 +134,7 @@ HTTP Headers Rewriting
{'charset': None,
'removed_header_dict': {},
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
('Location', '/web/20131010im_/http://example.com/other.html')]),
('Location', '/web/20131010/http://example.com/other.html')]),
'text_type': None}
# gzip
@ -172,7 +172,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
import pprint
urlrewriter = UrlRewriter('20131010im_/http://example.com/', '/web/')
urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')
def _test_js(string, extra = []):

View File

@ -22,6 +22,10 @@ function init_banner() {
var banner = document.getElementById(BANNER_ID);
if (wbinfo.is_embed) {
return;
}
if (!banner) {
banner = document.createElement("wb_div");
banner.setAttribute("id", BANNER_ID);

View File

@ -11,7 +11,9 @@
<script>
wbinfo = {}
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
wbinfo.is_embed = {{"true" if wbrequest.is_embed else "false"}};
</script>
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
<!-- End WB Insert -->

View File

@ -32,7 +32,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.2.0',
version='0.2.2',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',