mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: url rewriter: leave path-relative urls (e.g. 'page.html', './static/') unrewritten; only rewrite urls that start with a scheme or '//', start with '/', or contain '../' (#195)
update tests to reflect this new behavior
This commit is contained in:
parent
70fdaae2b3
commit
1fb6e9b5fa
@ -22,10 +22,10 @@ True
|
||||
[('Set-Cookie', 'some=value; Path=/pywb/')]
|
||||
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||
[('Set-Cookie', 'abc=def; Path=file.html')]
|
||||
|
||||
# keep Max-Age
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||
>>> rewrite_cookie('abc=def; Path=/file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# Cookie with invalid chars, not parsed
|
||||
@ -92,14 +92,14 @@ def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_with_expires():
|
||||
# keep expires
|
||||
res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_with_expires_utc_replace():
|
||||
# keep expires, UTC->GMT
|
||||
res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@ -113,14 +113,14 @@ def test_http_secure_flag():
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_secure_flag_remove():
|
||||
# Secure Remove
|
||||
res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_secure_flag_keep():
|
||||
# Secure Keep
|
||||
res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html; secure'
|
||||
|
||||
|
||||
|
@ -78,7 +78,7 @@ def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
|
||||
def test_cookie_headers():
|
||||
# cookie, host/origin rewriting
|
||||
res = _test_head_data([('Connection', 'close'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
|
||||
('Host', 'example.com'),
|
||||
('Origin', 'https://example.com')])
|
||||
|
||||
|
@ -8,7 +8,7 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<html><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
<html><a href="page.html">Text</a></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
@ -35,8 +35,8 @@ r"""
|
||||
>>> parse('<html><head><base href="/other/file.html"/>', urlrewriter=full_path_urlrewriter)
|
||||
<html><head><base href="/web/20131226101010/http://example.com/other/file.html"/>
|
||||
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>')
|
||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
>>> parse('<base href="./static/"/><img src="image.gif"/>')
|
||||
<base href="./static/"/><img src="image.gif"/>
|
||||
|
||||
# ensure trailing slash added
|
||||
>>> parse('<base href="http://example.com"/>')
|
||||
@ -47,7 +47,7 @@ r"""
|
||||
<html><head><base href="http://example.com/diff/path/file.html"/>
|
||||
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
||||
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
<base href="static/"/><img src="image.gif"/>
|
||||
|
||||
# Empty url
|
||||
>>> parse('<base href="">')
|
||||
@ -58,7 +58,7 @@ r"""
|
||||
|
||||
# href on other tags
|
||||
>>> parse('<HTML><div Href="page.html">Text</div></hTmL>')
|
||||
<html><div href="/web/20131226101010/http://example.com/some/path/page.html">Text</div></html>
|
||||
<html><div href="page.html">Text</div></html>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› > ?</div>')
|
||||
@ -148,10 +148,10 @@ r"""
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="window.location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
>>> parse('<div style="background: url(\'/other_path/abc.html\')" onblah onclick="window.location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/other_path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
||||
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
||||
@ -169,19 +169,19 @@ r"""
|
||||
<i style="background-image: url("/web/20131226101010/http://%D0%B8%D1%81%D0%BF/")"></i>
|
||||
|
||||
# Style
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
>>> parse('<style>@import "/styles.css" .a { font-face: url(\'../myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle and auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||
<style>@import url(styles.css)</style>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
>>> parse('<html><head><script src="/other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><script src="other.js"></script></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></html>
|
||||
<html><script src="cool.js"></script><script src="other.js"></script></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
@ -189,7 +189,7 @@ r"""
|
||||
>>> parse('<body><div style="">SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div style="">SomeTest</div>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
>>> parse('<link href="/some/path/abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><div>SomeTest</div>
|
||||
|
||||
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
|
||||
@ -236,7 +236,7 @@ r"""
|
||||
|
||||
# remove extra spaces
|
||||
>>> parse('<HTML><A Href=" page.html ">Text</a></hTmL>')
|
||||
<html><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
<html><a href="page.html">Text</a></html>
|
||||
|
||||
>>> parse('<HTML><A Href=" ">Text</a></hTmL>')
|
||||
<html><a href="">Text</a></html>
|
||||
|
@ -151,7 +151,7 @@ r"""
|
||||
'background: url(" /web/20131010/http://domain.com/path.html x ")'
|
||||
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(/web/20131010/http://example.com/file.jpeg)'
|
||||
'background: url(file.jpeg)'
|
||||
|
||||
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
|
||||
"background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')"
|
||||
@ -163,18 +163,18 @@ r"""
|
||||
"background: url('')"
|
||||
|
||||
>>> _test_css("background: url (\"weirdpath\')")
|
||||
'background: url ("/web/20131010/http://example.com/weirdpath\')'
|
||||
'background: url ("weirdpath\')'
|
||||
|
||||
>>> _test_css("@import url ('path.css')")
|
||||
>>> _test_css("@import url ('/path.css')")
|
||||
"@import url ('/web/20131010/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import url('path.css')")
|
||||
"@import url('/web/20131010/http://example.com/path.css')"
|
||||
"@import url('path.css')"
|
||||
|
||||
>>> _test_css("@import ( 'path.css')")
|
||||
"@import ( '/web/20131010/http://example.com/path.css')"
|
||||
"@import ( 'path.css')"
|
||||
|
||||
>>> _test_css("@import \"path.css\"")
|
||||
>>> _test_css("@import \"/path.css\"")
|
||||
'@import "/web/20131010/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../path.css\"")
|
||||
@ -184,7 +184,7 @@ r"""
|
||||
'@import (\'/web/20131010/http://example.com/url.css"'
|
||||
|
||||
>>> _test_css("@import (\"url.css\")")
|
||||
'@import ("/web/20131010/http://example.com/url.css")'
|
||||
'@import ("url.css")'
|
||||
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)'
|
||||
|
@ -123,7 +123,7 @@ def test_local_no_head_banner_only():
|
||||
assert 'window.location = "/other.html"' in buff
|
||||
|
||||
# link NOT rewritten
|
||||
assert '"another.html"' in buff
|
||||
assert '"/some/path/another.html"' in buff
|
||||
|
||||
def test_local_banner_only_no_rewrite():
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||
@ -138,7 +138,7 @@ def test_local_banner_only_no_rewrite():
|
||||
assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
|
||||
|
||||
# link NOT rewritten
|
||||
assert '"another.html"' in buff
|
||||
assert '"/some/path/another.html"' in buff
|
||||
|
||||
def test_local_2_link_only_rewrite():
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||
|
@ -21,19 +21,19 @@
|
||||
|
||||
# UrlRewriter tests
|
||||
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'/web/20131010/http://example.com/path/other.html'
|
||||
'other.html'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||
>>> do_rewrite('/path/file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||
'/web/20131010js_/http://example.com/path/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com/', '/coll/')
|
||||
>>> do_rewrite('/file.js', '20131010/http://example.com/', '/coll/')
|
||||
'/coll/20131010/http://example.com/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', 'js_')
|
||||
>>> do_rewrite('/file.js', '20131010/http://example.com', '/coll/', 'js_')
|
||||
'/coll/20131010js_/http://example.com/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', '')
|
||||
'/coll/20131010/http://example.com/file.js'
|
||||
'file.js'
|
||||
|
||||
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', 'http://localhost:8080/coll/')
|
||||
'/coll/20130907*/http://example.com/other.html'
|
||||
@ -41,8 +41,8 @@
|
||||
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20130907*/http://example.com/other.html'
|
||||
|
||||
>>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20130907*/http://example.com/path/other.html'
|
||||
>>> do_rewrite('other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'other.html'
|
||||
|
||||
>>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20131112im_/http://example.com/other.html'
|
||||
@ -87,7 +87,7 @@
|
||||
'2020/http://example.com/other.html'
|
||||
|
||||
>>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
|
||||
'/web/20131010010203/http://example.com/file.html'
|
||||
''
|
||||
|
||||
>>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'#anchor'
|
||||
|
@ -19,6 +19,9 @@ class UrlRewriter(object):
|
||||
|
||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||
|
||||
PARENT_PATH = '../'
|
||||
REL_PATH = '/'
|
||||
|
||||
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
||||
root_path=None, cookie_scope=None, rewrite_opts=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
@ -60,6 +63,11 @@ class UrlRewriter(object):
|
||||
if url.startswith(self.REL_SCHEME):
|
||||
is_abs = True
|
||||
scheme_rel = True
|
||||
elif (not is_abs and
|
||||
not url.startswith(self.REL_PATH) and
|
||||
self.PARENT_PATH not in url):
|
||||
return url
|
||||
|
||||
# if prefix starts with a scheme
|
||||
#if self.prefix_scheme:
|
||||
# url = self.prefix_scheme + ':' + url
|
||||
|
@ -109,7 +109,7 @@ var wombat_internal = function($wbwindow) {
|
||||
}
|
||||
|
||||
//============================================
|
||||
var rewrite_url = rewrite_url_;
|
||||
var rewrite_url = rewrite_url_debug;
|
||||
|
||||
function rewrite_url_debug(url, use_rel, mod) {
|
||||
var rewritten = rewrite_url_(url, use_rel, mod);
|
||||
|
@ -10,5 +10,5 @@ if (some_val) {
|
||||
}
|
||||
</script>
|
||||
Test Content
|
||||
<a href="another.html">Some Link</a>
|
||||
<a href="/some/path/another.html">Some Link</a>
|
||||
</body>
|
||||
|
@ -5,4 +5,4 @@ if (some_val) {
|
||||
}
|
||||
</script>
|
||||
Test Content
|
||||
<a href="another.html">Some Link</a>
|
||||
<a href="/some/path/another.html">Some Link</a>
|
||||
|
Loading…
x
Reference in New Issue
Block a user