mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop'
This commit is contained in:
commit
c3f98c3d38
@ -1,4 +1,4 @@
|
||||
PyWb 0.31.0
|
||||
PyWb 0.32.0
|
||||
===========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '0.31.0'
|
||||
__version__ = '0.32.0'
|
||||
|
||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||
|
||||
|
@ -17,6 +17,12 @@ class RewrittenStatusAndHeaders(object):
|
||||
def contains_removed_header(self, name, value):
|
||||
return self.removed_header_dict.get(name) == value
|
||||
|
||||
def readd_rewrite_removed(self):
|
||||
for name in HeaderRewriter.PROXY_NO_REWRITE_HEADERS:
|
||||
value = self.removed_header_dict.get(name)
|
||||
if value is not None:
|
||||
self.status_headers.headers.append((name, value))
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HeaderRewriter(object):
|
||||
@ -34,19 +40,21 @@ class HeaderRewriter(object):
|
||||
'json': ['application/json'],
|
||||
|
||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||
|
||||
'plain': ['text/plain'],
|
||||
}
|
||||
|
||||
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
|
||||
'accept-ranges']
|
||||
'accept-ranges', 'www-authenticate', 'proxy-authenticate']
|
||||
|
||||
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
|
||||
|
||||
ENCODING_HEADERS = ['content-encoding']
|
||||
#ENCODING_HEADERS = ['content-encoding']
|
||||
|
||||
REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy',
|
||||
'strict-transport-security']
|
||||
|
||||
PROXY_NO_REWRITE_HEADERS = ['content-length']
|
||||
PROXY_NO_REWRITE_HEADERS = ['content-length', 'content-encoding']
|
||||
|
||||
COOKIE_HEADERS = ['set-cookie', 'cookie']
|
||||
|
||||
@ -141,9 +149,10 @@ class HeaderRewriter(object):
|
||||
elif urlrewriter and lowername in self.URL_REWRITE_HEADERS:
|
||||
new_headers.append((name, urlrewriter.rewrite(value)))
|
||||
|
||||
elif lowername in self.ENCODING_HEADERS:
|
||||
elif lowername in self.PROXY_NO_REWRITE_HEADERS:
|
||||
if content_rewritten:
|
||||
removed_header_dict[lowername] = value
|
||||
add_prefixed_header(name, value)
|
||||
else:
|
||||
add_header(name, value)
|
||||
|
||||
@ -151,10 +160,6 @@ class HeaderRewriter(object):
|
||||
removed_header_dict[lowername] = value
|
||||
add_prefixed_header(name, value)
|
||||
|
||||
elif (lowername in self.PROXY_NO_REWRITE_HEADERS and
|
||||
not content_rewritten):
|
||||
add_header(name, value)
|
||||
|
||||
elif (lowername in self.COOKIE_HEADERS and
|
||||
cookie_rewriter):
|
||||
cookie_list = cookie_rewriter.rewrite(value)
|
||||
|
@ -40,6 +40,7 @@ class HTMLRewriterMixin(object):
|
||||
'embed': {'src': 'oe_'},
|
||||
'head': {'': defmod}, # for head rewriting
|
||||
'iframe': {'src': 'if_'},
|
||||
'image': {'src': 'im_', 'xlink:href': 'im_'},
|
||||
'img': {'src': 'im_',
|
||||
'srcset': 'im_'},
|
||||
'ins': {'cite': defmod},
|
||||
@ -118,6 +119,8 @@ class HTMLRewriterMixin(object):
|
||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
|
||||
re.IGNORECASE | re.MULTILINE)
|
||||
|
||||
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
|
||||
|
||||
def _rewrite_meta_refresh(self, meta_refresh):
|
||||
if not meta_refresh:
|
||||
return ''
|
||||
@ -225,12 +228,16 @@ class HTMLRewriterMixin(object):
|
||||
else:
|
||||
return ''
|
||||
|
||||
def _rewrite_script(self, script_content):
|
||||
if script_content:
|
||||
return self.js_rewriter.rewrite(script_content)
|
||||
else:
|
||||
def _rewrite_script(self, script_content, ensure_window=False):
|
||||
if not script_content:
|
||||
return ''
|
||||
|
||||
content = self.js_rewriter.rewrite(script_content)
|
||||
if ensure_window:
|
||||
content = self.ADD_WINDOW.sub('window.\\1', content)
|
||||
|
||||
return content
|
||||
|
||||
def has_attr(self, tag_attrs, attr):
|
||||
name, value = attr
|
||||
for attr_name, attr_value in tag_attrs:
|
||||
@ -266,7 +273,7 @@ class HTMLRewriterMixin(object):
|
||||
# special case: inline JS/event handler
|
||||
if ((attr_value and attr_value.startswith('javascript:'))
|
||||
or attr_name.startswith('on')):
|
||||
attr_value = self._rewrite_script(attr_value)
|
||||
attr_value = self._rewrite_script(attr_value, True)
|
||||
|
||||
# special case: inline CSS/style attribute
|
||||
elif attr_name == 'style':
|
||||
@ -328,6 +335,11 @@ class HTMLRewriterMixin(object):
|
||||
elif (tag == 'base') and (attr_name == 'href') and attr_value:
|
||||
rw_mod = handler.get(attr_name)
|
||||
attr_value = self._rewrite_base(attr_value, rw_mod)
|
||||
|
||||
elif attr_name == 'href':
|
||||
rw_mod = self.defmod
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
else:
|
||||
# rewrite url using tag handler
|
||||
rw_mod = handler.get(attr_name)
|
||||
|
@ -209,7 +209,7 @@ class XMLRewriter(RegexRewriter):
|
||||
#=================================================================
|
||||
class CSSRewriter(RegexRewriter):
|
||||
|
||||
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"
|
||||
CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*([^)'\"]+)\\s*(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)"
|
||||
|
||||
CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" +
|
||||
"(?!url[\\s\\(])([\w.:/\\\\-]+)")
|
||||
|
@ -6,13 +6,12 @@ from pywb.rewrite.rewrite_content import RewriteContent
|
||||
# ============================================================================
|
||||
# Expiermental: not fully tested
|
||||
class RewriteContentAMF(RewriteContent): #pragma: no cover
|
||||
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
|
||||
|
||||
if status_headers.get_header('Content-Type') == 'application/x-amf':
|
||||
def handle_custom_rewrite(self, rewritten_headers, stream, mod, env):
|
||||
if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
|
||||
stream = self.rewrite_amf(stream, env)
|
||||
|
||||
return (super(RewriteContentAMF, self).
|
||||
handle_custom_rewrite(text_type, status_headers, stream, env))
|
||||
handle_custom_rewrite(rewritten_headers, stream, mod, env))
|
||||
|
||||
def rewrite_amf(self, stream, env):
|
||||
try:
|
||||
|
@ -118,11 +118,9 @@ class RewriteContent(object):
|
||||
urlkey,
|
||||
cookie_rewriter)
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
res = self.handle_custom_rewrite(rewritten_headers.text_type,
|
||||
status_headers,
|
||||
res = self.handle_custom_rewrite(rewritten_headers,
|
||||
stream,
|
||||
wb_url.mod,
|
||||
env)
|
||||
if res:
|
||||
return res
|
||||
@ -131,6 +129,7 @@ class RewriteContent(object):
|
||||
# ====================================================================
|
||||
# special case -- need to ungzip the body
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
text_type = rewritten_headers.text_type
|
||||
|
||||
# see known js/css modifier specified, the context should run
|
||||
@ -246,11 +245,18 @@ class RewriteContent(object):
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
|
||||
def handle_custom_rewrite(self, rewritten_headers, stream, mod, env):
|
||||
text_type = rewritten_headers.text_type
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
# use rewritten headers, but no further rewriting needed
|
||||
if text_type is None:
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
if text_type == 'plain' and not mod in ('js_', 'cs_'):
|
||||
rewritten_headers.readd_rewrite_removed()
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
@staticmethod
|
||||
def _extract_html_charset(buff, status_headers):
|
||||
charset = None
|
||||
|
@ -22,10 +22,10 @@ True
|
||||
[('Set-Cookie', 'some=value; Path=/pywb/')]
|
||||
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||
[('Set-Cookie', 'abc=def; Path=file.html')]
|
||||
|
||||
# keep Max-Age
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||
>>> rewrite_cookie('abc=def; Path=/file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# Cookie with invalid chars, not parsed
|
||||
@ -92,14 +92,14 @@ def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_with_expires():
|
||||
# keep expires
|
||||
res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_with_expires_utc_replace():
|
||||
# keep expires, UTC->GMT
|
||||
res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@ -113,14 +113,14 @@ def test_http_secure_flag():
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_secure_flag_remove():
|
||||
# Secure Remove
|
||||
res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||
assert len(res) == 1
|
||||
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html'
|
||||
|
||||
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
|
||||
def test_secure_flag_keep():
|
||||
# Secure Keep
|
||||
res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||
res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html; secure'
|
||||
|
||||
|
||||
|
@ -6,7 +6,7 @@ HTTP Headers Rewriting
|
||||
# Text with charset
|
||||
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
|
||||
{'charset': 'utf-8',
|
||||
'removed_header_dict': {},
|
||||
'removed_header_dict': {'content-length': '5'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
||||
('X-Archive-Orig-Content-Length', '5'),
|
||||
('Content-Type', 'text/html;charset=UTF-8')]),
|
||||
@ -24,9 +24,11 @@ HTTP Headers Rewriting
|
||||
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'content-encoding': 'gzip',
|
||||
'content-length': '199999',
|
||||
'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
|
||||
('Content-Type', 'text/javascript'),
|
||||
('X-Archive-Orig-Content-Encoding', 'gzip'),
|
||||
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
|
||||
'text_type': 'js'}
|
||||
|
||||
@ -76,7 +78,7 @@ def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
|
||||
def test_cookie_headers():
|
||||
# cookie, host/origin rewriting
|
||||
res = _test_head_data([('Connection', 'close'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'),
|
||||
('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
|
||||
('Host', 'example.com'),
|
||||
('Origin', 'https://example.com')])
|
||||
|
||||
|
@ -8,7 +8,7 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<html><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
<html><a href="page.html">Text</a></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
@ -35,8 +35,8 @@ r"""
|
||||
>>> parse('<html><head><base href="/other/file.html"/>', urlrewriter=full_path_urlrewriter)
|
||||
<html><head><base href="/web/20131226101010/http://example.com/other/file.html"/>
|
||||
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>')
|
||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
>>> parse('<base href="./static/"/><img src="image.gif"/>')
|
||||
<base href="./static/"/><img src="image.gif"/>
|
||||
|
||||
# ensure trailing slash added
|
||||
>>> parse('<base href="http://example.com"/>')
|
||||
@ -47,7 +47,7 @@ r"""
|
||||
<html><head><base href="http://example.com/diff/path/file.html"/>
|
||||
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
||||
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
<base href="static/"/><img src="image.gif"/>
|
||||
|
||||
# Empty url
|
||||
>>> parse('<base href="">')
|
||||
@ -56,6 +56,9 @@ r"""
|
||||
>>> parse('<base href>')
|
||||
<base href>
|
||||
|
||||
# href on other tags
|
||||
>>> parse('<HTML><div Href="page.html">Text</div></hTmL>')
|
||||
<html><div href="page.html">Text</div></html>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› > ?</div>')
|
||||
@ -145,25 +148,40 @@ r"""
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<div style="background: url(\'/other_path/abc.html\')" onblah onclick="window.location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/other_path/abc.html')" onblah onclick="window.WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
||||
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||
|
||||
>>> parse("<i style='background-image: url('http://foo.example.com/')'></i>")
|
||||
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||
|
||||
#>>> parse('<i style=\'background-image: url("http://исп/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://%D0%B8%D1%81%D0%BF/")"></i>
|
||||
|
||||
# Style
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
>>> parse('<style>@import "/styles.css" .a { font-face: url(\'../myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle and auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||
<style>@import url(styles.css)</style>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
>>> parse('<html><head><script src="/other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><script src="other.js"></script></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></html>
|
||||
<html><script src="cool.js"></script><script src="other.js"></script></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
@ -171,7 +189,7 @@ r"""
|
||||
>>> parse('<body><div style="">SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div style="">SomeTest</div>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
>>> parse('<link href="/some/path/abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><div>SomeTest</div>
|
||||
|
||||
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
|
||||
@ -218,7 +236,7 @@ r"""
|
||||
|
||||
# remove extra spaces
|
||||
>>> parse('<HTML><A Href=" page.html ">Text</a></hTmL>')
|
||||
<html><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
<html><a href="page.html">Text</a></html>
|
||||
|
||||
>>> parse('<HTML><A Href=" ">Text</a></hTmL>')
|
||||
<html><a href="">Text</a></html>
|
||||
|
@ -144,8 +144,14 @@ r"""
|
||||
>>> _test_css("background: url(\"http://domain.com/path.html\")")
|
||||
'background: url("/web/20131010/http://domain.com/path.html")'
|
||||
|
||||
>>> _test_css('background: url(" http://domain.com/path.html ")')
|
||||
'background: url(" /web/20131010/http://domain.com/path.html ")'
|
||||
|
||||
>>> _test_css('background: url(" http://domain.com/path.html x ")')
|
||||
'background: url(" /web/20131010/http://domain.com/path.html x ")'
|
||||
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(/web/20131010/http://example.com/file.jpeg)'
|
||||
'background: url(file.jpeg)'
|
||||
|
||||
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
|
||||
"background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')"
|
||||
@ -157,18 +163,18 @@ r"""
|
||||
"background: url('')"
|
||||
|
||||
>>> _test_css("background: url (\"weirdpath\')")
|
||||
'background: url ("/web/20131010/http://example.com/weirdpath\')'
|
||||
'background: url ("weirdpath\')'
|
||||
|
||||
>>> _test_css("@import url ('path.css')")
|
||||
>>> _test_css("@import url ('/path.css')")
|
||||
"@import url ('/web/20131010/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import url('path.css')")
|
||||
"@import url('/web/20131010/http://example.com/path.css')"
|
||||
"@import url('path.css')"
|
||||
|
||||
>>> _test_css("@import ( 'path.css')")
|
||||
"@import ( '/web/20131010/http://example.com/path.css')"
|
||||
"@import ( 'path.css')"
|
||||
|
||||
>>> _test_css("@import \"path.css\"")
|
||||
>>> _test_css("@import \"/path.css\"")
|
||||
'@import "/web/20131010/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../path.css\"")
|
||||
@ -178,7 +184,7 @@ r"""
|
||||
'@import (\'/web/20131010/http://example.com/url.css"'
|
||||
|
||||
>>> _test_css("@import (\"url.css\")")
|
||||
'@import ("/web/20131010/http://example.com/url.css")'
|
||||
'@import ("url.css")'
|
||||
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)'
|
||||
|
@ -123,7 +123,7 @@ def test_local_no_head_banner_only():
|
||||
assert 'window.location = "/other.html"' in buff
|
||||
|
||||
# link NOT rewritten
|
||||
assert '"another.html"' in buff
|
||||
assert '"/some/path/another.html"' in buff
|
||||
|
||||
def test_local_banner_only_no_rewrite():
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||
@ -138,7 +138,7 @@ def test_local_banner_only_no_rewrite():
|
||||
assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
|
||||
|
||||
# link NOT rewritten
|
||||
assert '"another.html"' in buff
|
||||
assert '"/some/path/another.html"' in buff
|
||||
|
||||
def test_local_2_link_only_rewrite():
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||
|
@ -21,19 +21,19 @@
|
||||
|
||||
# UrlRewriter tests
|
||||
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'/web/20131010/http://example.com/path/other.html'
|
||||
'other.html'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||
>>> do_rewrite('/path/file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||
'/web/20131010js_/http://example.com/path/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com/', '/coll/')
|
||||
>>> do_rewrite('/file.js', '20131010/http://example.com/', '/coll/')
|
||||
'/coll/20131010/http://example.com/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', 'js_')
|
||||
>>> do_rewrite('/file.js', '20131010/http://example.com', '/coll/', 'js_')
|
||||
'/coll/20131010js_/http://example.com/file.js'
|
||||
|
||||
>>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', '')
|
||||
'/coll/20131010/http://example.com/file.js'
|
||||
'file.js'
|
||||
|
||||
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', 'http://localhost:8080/coll/')
|
||||
'/coll/20130907*/http://example.com/other.html'
|
||||
@ -41,8 +41,8 @@
|
||||
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20130907*/http://example.com/other.html'
|
||||
|
||||
>>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20130907*/http://example.com/path/other.html'
|
||||
>>> do_rewrite('other.html', '20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'other.html'
|
||||
|
||||
>>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20131112im_/http://example.com/other.html'
|
||||
@ -87,7 +87,7 @@
|
||||
'2020/http://example.com/other.html'
|
||||
|
||||
>>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
|
||||
'/web/20131010010203/http://example.com/file.html'
|
||||
''
|
||||
|
||||
>>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'#anchor'
|
||||
|
@ -19,6 +19,9 @@ class UrlRewriter(object):
|
||||
|
||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||
|
||||
PARENT_PATH = '../'
|
||||
REL_PATH = '/'
|
||||
|
||||
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
||||
root_path=None, cookie_scope=None, rewrite_opts=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
@ -60,6 +63,11 @@ class UrlRewriter(object):
|
||||
if url.startswith(self.REL_SCHEME):
|
||||
is_abs = True
|
||||
scheme_rel = True
|
||||
elif (not is_abs and
|
||||
not url.startswith(self.REL_PATH) and
|
||||
self.PARENT_PATH not in url):
|
||||
return url
|
||||
|
||||
# if prefix starts with a scheme
|
||||
#if self.prefix_scheme:
|
||||
# url = self.prefix_scheme + ':' + url
|
||||
|
@ -135,6 +135,15 @@ rules:
|
||||
|
||||
fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
|
||||
|
||||
# periscope
|
||||
#=================================================================
|
||||
|
||||
- url_prefix: 'tv,periscope,assets)/js/'
|
||||
|
||||
rewrite:
|
||||
js_regexs:
|
||||
- match: '"location"'
|
||||
replace: '"WB_wombat_location"'
|
||||
|
||||
# vimeo rules
|
||||
#=================================================================
|
||||
|
@ -165,7 +165,7 @@ __wbvidrw = (function() {
|
||||
var name = child.getAttribute("name");
|
||||
name = name.toLowerCase();
|
||||
|
||||
if (name == "movie") {
|
||||
if (name == "movie" || name == "src") {
|
||||
var value = child.getAttribute("value");
|
||||
obj_url = value;
|
||||
}
|
||||
|
@ -110,7 +110,7 @@ function remove_event(name, func, object) {
|
||||
}
|
||||
}
|
||||
|
||||
function notify_top() {
|
||||
function notify_top(event) {
|
||||
if (!window.__WB_top_frame) {
|
||||
return;
|
||||
}
|
||||
@ -123,25 +123,18 @@ function notify_top() {
|
||||
return;
|
||||
}
|
||||
|
||||
//if (window.__WB_top_frame.update_wb_url) {
|
||||
// window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
|
||||
// wbinfo.timestamp,
|
||||
// wbinfo.request_ts,
|
||||
// wbinfo.is_live);
|
||||
//}
|
||||
|
||||
var message = {
|
||||
"url": window.WB_wombat_location.href,
|
||||
"ts": wbinfo.timestamp,
|
||||
"request_ts": wbinfo.request_ts,
|
||||
"is_live": wbinfo.is_live,
|
||||
"title": "",
|
||||
"title": document ? document.title : "",
|
||||
"wb_type": "load",
|
||||
}
|
||||
|
||||
window.__WB_top_frame.postMessage(message, "*");
|
||||
|
||||
remove_event("readystatechange", notify_top, document);
|
||||
//remove_event("readystatechange", notify_top, document);
|
||||
}
|
||||
|
||||
this.load = function() {
|
||||
@ -152,7 +145,7 @@ this.load = function() {
|
||||
window._wb_js_inited = true;
|
||||
|
||||
// Non-Framed Replay OR top frame for framed replay!
|
||||
if (window.wbinfo && (!window.__WB_top_frame || window.__WB_top_frame == window)) {
|
||||
if (window.wbinfo && !window.__WB_top_frame) {
|
||||
if (wbinfo.is_framed && wbinfo.mod != "bn_") {
|
||||
var hash = window.location.hash;
|
||||
|
||||
@ -171,7 +164,7 @@ this.load = function() {
|
||||
add_event("readystatechange", init_banner, document);
|
||||
|
||||
// Framed Replay
|
||||
} else if (window.__WB_top_frame && window != window.__WB_top_frame && window.__WB_top_frame.update_wb_url) {
|
||||
} else if (window.__WB_top_frame) {
|
||||
add_event("readystatechange", notify_top, document);
|
||||
}
|
||||
}
|
||||
|
@ -19,26 +19,31 @@ This file is part of pywb, https://github.com/ikreymer/pywb
|
||||
|
||||
var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/;
|
||||
|
||||
var TS_REGEX = /\/([\d]{1,14})\//;
|
||||
var TS_REGEX = /\/([\d]{1,14})(?:\w+_)?\/(?:\w+[:])?\/\//;
|
||||
|
||||
var curr_state = {};
|
||||
//var curr_state = {};
|
||||
|
||||
var IFRAME_ID = "replay_iframe";
|
||||
|
||||
function make_url(url, ts, mod)
|
||||
var last_inner_hash = undefined;
|
||||
|
||||
function make_url(url, ts, mod, prefix)
|
||||
{
|
||||
if (ts || mod) {
|
||||
mod += "/";
|
||||
}
|
||||
|
||||
prefix = prefix || wbinfo.prefix;
|
||||
|
||||
if (ts) {
|
||||
return wbinfo.prefix + ts + mod + url;
|
||||
return prefix + ts + mod + url;
|
||||
} else {
|
||||
return wbinfo.prefix + mod + url;
|
||||
return prefix + mod + url;
|
||||
}
|
||||
}
|
||||
|
||||
function push_state(state) {
|
||||
/*
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
if (frame.WB_wombat_location) {
|
||||
var curr_href = frame.WB_wombat_location.href;
|
||||
@ -48,13 +53,19 @@ function push_state(state) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
|
||||
state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod, wbinfo.outer_prefix);
|
||||
state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
|
||||
|
||||
var canon_url = make_url(state.url, state.request_ts, "");
|
||||
var canon_url = make_url(state.url, state.request_ts, "", wbinfo.outer_prefix);
|
||||
|
||||
if (window.location.href != canon_url) {
|
||||
window.history.replaceState(state, "", canon_url);
|
||||
if (state.wb_type != "pushState") {
|
||||
window.history.replaceState(state, "", canon_url);
|
||||
} else {
|
||||
window.history.pushState(state, "", canon_url);
|
||||
}
|
||||
}
|
||||
|
||||
set_state(state);
|
||||
@ -63,8 +74,8 @@ function push_state(state) {
|
||||
function pop_state(state) {
|
||||
set_state(state);
|
||||
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
frame.src = state.inner_url;
|
||||
//var frame = document.getElementById(IFRAME_ID);
|
||||
//frame.src = state.inner_url;
|
||||
}
|
||||
|
||||
function extract_ts(url)
|
||||
@ -103,7 +114,7 @@ function set_state(state) {
|
||||
}
|
||||
}
|
||||
|
||||
curr_state = state;
|
||||
//curr_state = state;
|
||||
}
|
||||
|
||||
window.onpopstate = function(event) {
|
||||
@ -123,43 +134,6 @@ function extract_ts_cookie(value) {
|
||||
}
|
||||
}
|
||||
|
||||
function iframe_loaded(event) {
|
||||
var url;
|
||||
var ts;
|
||||
var request_ts;
|
||||
var capture_str;
|
||||
var is_live = false;
|
||||
var iframe = document.getElementById(IFRAME_ID).contentWindow;
|
||||
|
||||
if (iframe.WB_wombat_location) {
|
||||
url = iframe.WB_wombat_location.href;
|
||||
} else {
|
||||
url = extract_replay_url(iframe.location.href);
|
||||
}
|
||||
|
||||
if (iframe.wbinfo) {
|
||||
ts = iframe.wbinfo.timestamp;
|
||||
request_ts = iframe.wbinfo.request_ts;
|
||||
is_live = iframe.wbinfo.is_live;
|
||||
} else {
|
||||
ts = extract_ts_cookie(iframe.document.cookie);
|
||||
if (ts) {
|
||||
is_live = true;
|
||||
} else {
|
||||
ts = extract_ts(iframe.location.href);
|
||||
}
|
||||
request_ts = ts;
|
||||
}
|
||||
|
||||
var state = {}
|
||||
state["url"] = url;
|
||||
state["ts"] = ts;
|
||||
state["request_ts"] = request_ts;
|
||||
state["is_live"] = is_live
|
||||
|
||||
update_wb_url(state);
|
||||
}
|
||||
|
||||
|
||||
function init_pm() {
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
@ -172,7 +146,8 @@ function init_pm() {
|
||||
|
||||
// Check if iframe url change message
|
||||
if (typeof(event.data) == "object" && event.data["wb_type"]) {
|
||||
update_wb_url(event.data);
|
||||
handle_message(event.data);
|
||||
|
||||
} else {
|
||||
// Pass to parent
|
||||
window.parent.postMessage(event.data, "*");
|
||||
@ -187,55 +162,67 @@ function init_pm() {
|
||||
}
|
||||
|
||||
|
||||
function update_wb_url(state) {
|
||||
if (curr_state.url == state.url && curr_state.ts == state.ts) {
|
||||
return;
|
||||
function handle_message(state) {
|
||||
var type = state.wb_type;
|
||||
|
||||
if (type == "load" || type == "pushState" || type == "replaceState") {
|
||||
update_wb_url(state);
|
||||
} else if (type == "go") {
|
||||
window.history.go(state.param);
|
||||
} else if (type == "back") {
|
||||
window.history.back();
|
||||
} else if (type == "forward") {
|
||||
window.history.forward();
|
||||
} else if (type == "hashchange") {
|
||||
inner_hash_changed(state);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function update_wb_url(state) {
|
||||
//if (curr_state && curr_state.url == state.url && curr_state.ts == state.ts) {
|
||||
// return;
|
||||
//}
|
||||
|
||||
state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
|
||||
|
||||
push_state(state);
|
||||
}
|
||||
|
||||
// Load Banner
|
||||
if (_wb_js) {
|
||||
_wb_js.load();
|
||||
function inner_hash_changed(state) {
|
||||
if (window.location.hash != state.hash) {
|
||||
window.location.hash = state.hash;
|
||||
}
|
||||
last_inner_hash = state.hash;
|
||||
}
|
||||
|
||||
function outer_hash_changed(event) {
|
||||
if (window.location.hash == last_inner_hash) {
|
||||
return;
|
||||
}
|
||||
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
|
||||
var message = {"wb_type": "outer_hashchange", "hash": window.location.hash}
|
||||
|
||||
frame.postMessage(message, "*", undefined, true);
|
||||
}
|
||||
|
||||
function init_hash_connect() {
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
var frame = document.getElementById(IFRAME_ID);
|
||||
|
||||
if (window.location.hash) {
|
||||
var curr_url = wbinfo.capture_url + window.location.hash;
|
||||
|
||||
frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
|
||||
|
||||
frame.src = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
|
||||
|
||||
last_inner_hash = window.location.hash;
|
||||
//frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
|
||||
//frame.location.hash = window.location.hash;
|
||||
}
|
||||
|
||||
function outer_hash_changed() {
|
||||
var the_frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
|
||||
if (window.location.hash == the_frame.location.hash) {
|
||||
return;
|
||||
}
|
||||
|
||||
the_frame.location.hash = window.location.hash;
|
||||
//the_frame.location.href = make_url(curr_url, curr_state.request_ts, wbinfo.replay_mod);
|
||||
}
|
||||
|
||||
function inner_hash_changed() {
|
||||
var the_frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
|
||||
if (window.location.hash == the_frame.location.hash) {
|
||||
return;
|
||||
}
|
||||
|
||||
window.location.hash = the_frame.location.hash;
|
||||
}
|
||||
|
||||
if ("onhashchange" in window) {
|
||||
window.addEventListener("hashchange", outer_hash_changed, false);
|
||||
frame.addEventListener("hashchange", inner_hash_changed, false);
|
||||
}
|
||||
|
||||
// Init Post Message connect
|
||||
@ -244,3 +231,10 @@ function init_hash_connect() {
|
||||
|
||||
document.addEventListener("DOMContentLoaded", init_hash_connect);
|
||||
|
||||
// Load Banner
|
||||
if (_wb_js) {
|
||||
_wb_js.load();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
|
||||
*/
|
||||
|
||||
//============================================
|
||||
// Wombat JS-Rewriting Library v2.12
|
||||
// Wombat JS-Rewriting Library v2.16
|
||||
//============================================
|
||||
|
||||
|
||||
@ -314,6 +314,8 @@ var wombat_internal = function($wbwindow) {
|
||||
return "";
|
||||
}
|
||||
|
||||
var orig_href = href;
|
||||
|
||||
// proxy mode: no extraction needed
|
||||
if (!wb_replay_prefix) {
|
||||
return href;
|
||||
@ -348,7 +350,7 @@ var wombat_internal = function($wbwindow) {
|
||||
href = href.substr(4);
|
||||
}
|
||||
|
||||
if (!starts_with(href, VALID_PREFIXES)) {
|
||||
if (href != orig_href && !starts_with(href, VALID_PREFIXES)) {
|
||||
href = HTTP_PREFIX + href;
|
||||
}
|
||||
}
|
||||
@ -402,7 +404,17 @@ var wombat_internal = function($wbwindow) {
|
||||
function make_parser(href) {
|
||||
href = extract_orig(href);
|
||||
|
||||
var p = $wbwindow.document.createElement("a", true);
|
||||
var baseWin;
|
||||
|
||||
// special case: for newly opened blank windows, use the opener
|
||||
// to create parser to have the proper baseURI
|
||||
if ($wbwindow.location.href == "about:blank" && $wbwindow.opener) {
|
||||
baseWin = $wbwindow.opener;
|
||||
} else {
|
||||
baseWin = $wbwindow;
|
||||
}
|
||||
|
||||
var p = baseWin.document.createElement("a", true);
|
||||
p.href = href;
|
||||
return p;
|
||||
}
|
||||
@ -712,21 +724,21 @@ var wombat_internal = function($wbwindow) {
|
||||
function rewritten_func(state_obj, title, url) {
|
||||
url = rewrite_url(url);
|
||||
|
||||
var abs_url = extract_orig(url);
|
||||
|
||||
if (abs_url && !starts_with(abs_url, $wbwindow.WB_wombat_location.origin + "/")) {
|
||||
throw new DOMException("Invalid history change: " + abs_url);
|
||||
}
|
||||
|
||||
if (url == $wbwindow.location.href) {
|
||||
return;
|
||||
}
|
||||
|
||||
orig_func.call(this, state_obj, title, url);
|
||||
|
||||
//if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
|
||||
// $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
|
||||
// wb_info.timestamp,
|
||||
// wb_info.request_ts,
|
||||
// wb_info.is_live);
|
||||
//}
|
||||
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
|
||||
if ($wbwindow.__WB_top_frame) {
|
||||
var message = {
|
||||
"url": url,
|
||||
"url": abs_url,
|
||||
"ts": wb_info.timestamp,
|
||||
"request_ts": wb_info.request_ts,
|
||||
"is_live": wb_info.is_live,
|
||||
@ -734,7 +746,7 @@ var wombat_internal = function($wbwindow) {
|
||||
"wb_type": func_name,
|
||||
}
|
||||
|
||||
$wbwindow.__WB_top_frame.postMessage(message, "*");
|
||||
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
||||
}
|
||||
}
|
||||
|
||||
@ -746,6 +758,45 @@ var wombat_internal = function($wbwindow) {
|
||||
return rewritten_func;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function override_history_nav(func_name) {
|
||||
if (!$wbwindow.history) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Only useful for framed replay
|
||||
if (!$wbwindow.__WB_top_frame) {
|
||||
return;
|
||||
}
|
||||
|
||||
var orig_func = $wbwindow.history[func_name];
|
||||
|
||||
if (!orig_func) {
|
||||
return;
|
||||
}
|
||||
|
||||
function rewritten_func() {
|
||||
orig_func.apply(this, arguments);
|
||||
|
||||
var message = {
|
||||
"wb_type": func_name,
|
||||
}
|
||||
|
||||
if (func_name == "go") {
|
||||
message["param"] = arguments[0];
|
||||
}
|
||||
|
||||
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
||||
}
|
||||
|
||||
$wbwindow.history[func_name] = rewritten_func;
|
||||
if ($wbwindow.History && $wbwindow.History.prototype) {
|
||||
$wbwindow.History.prototype[func_name] = rewritten_func;
|
||||
}
|
||||
|
||||
return rewritten_func;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_ajax_rewrite() {
|
||||
if (!$wbwindow.XMLHttpRequest ||
|
||||
@ -1157,6 +1208,35 @@ var wombat_internal = function($wbwindow) {
|
||||
return value;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function rewrite_frame_src(elem, name)
|
||||
{
|
||||
var value = wb_getAttribute.call(elem, name);
|
||||
var new_value = undefined;
|
||||
|
||||
// special case for rewriting javascript: urls that contain WB_wombat_
|
||||
// must insert wombat init first!
|
||||
if (starts_with(value, "javascript:")) {
|
||||
if (value.indexOf("WB_wombat_") >= 0) {
|
||||
var JS = "javascript:";
|
||||
new_value = JS;
|
||||
new_value += "window.parent._wb_wombat.init_new_window_wombat(window);"
|
||||
new_value += value.substr(JS.length);
|
||||
}
|
||||
}
|
||||
|
||||
if (!new_value) {
|
||||
new_value = rewrite_url(value, false);
|
||||
}
|
||||
|
||||
if (new_value != value) {
|
||||
wb_setAttribute.call(elem, name, new_value);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function rewrite_elem(elem)
|
||||
{
|
||||
@ -1169,6 +1249,7 @@ var wombat_internal = function($wbwindow) {
|
||||
if (elem.tagName == "STYLE") {
|
||||
var new_content = rewrite_style(elem.textContent);
|
||||
if (elem.textContent != new_content) {
|
||||
elem.textContent = new_content;
|
||||
changed = true;
|
||||
}
|
||||
} else if (elem.tagName == "OBJECT") {
|
||||
@ -1177,10 +1258,13 @@ var wombat_internal = function($wbwindow) {
|
||||
changed = rewrite_attr(elem, "action", true);
|
||||
} else if (elem.tagName == "INPUT") {
|
||||
changed = rewrite_attr(elem, "value", true);
|
||||
} else if (elem.tagName == "IFRAME" || elem.tagName == "FRAME") {
|
||||
changed = rewrite_frame_src(elem, "src");
|
||||
} else {
|
||||
changed = rewrite_attr(elem, "src");
|
||||
changed = rewrite_attr(elem, "href") || changed;
|
||||
changed = rewrite_attr(elem, "style") || changed;
|
||||
changed = rewrite_attr(elem, "poster") || changed;
|
||||
}
|
||||
|
||||
if (elem.getAttribute) {
|
||||
@ -1648,6 +1732,47 @@ var wombat_internal = function($wbwindow) {
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_hash_change()
|
||||
{
|
||||
if (!$wbwindow.__WB_top_frame) {
|
||||
return;
|
||||
}
|
||||
|
||||
function receive_hash_change(event)
|
||||
{
|
||||
if (!event.data || event.source != $wbwindow.__WB_top_frame) {
|
||||
return;
|
||||
}
|
||||
|
||||
var message = event.data;
|
||||
|
||||
if (!message.wb_type) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (message.wb_type == "outer_hashchange") {
|
||||
if ($wbwindow.location.hash != message.hash) {
|
||||
$wbwindow.location.hash = message.hash;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function send_hash_change() {
|
||||
var message = {"wb_type": "hashchange",
|
||||
"hash": $wbwindow.location.hash
|
||||
}
|
||||
|
||||
if ($wbwindow.__WB_top_frame) {
|
||||
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
||||
}
|
||||
}
|
||||
|
||||
$wbwindow.addEventListener("message", receive_hash_change);
|
||||
|
||||
$wbwindow.addEventListener("hashchange", send_hash_change);
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_postmessage_override($wbwindow)
|
||||
{
|
||||
@ -1659,7 +1784,7 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
$wbwindow.__orig_postMessage = orig;
|
||||
|
||||
var postmessage_rewritten = function(message, targetOrigin, transfer) {
|
||||
var postmessage_rewritten = function(message, targetOrigin, transfer, from_top) {
|
||||
var from = undefined;
|
||||
var src_id = undefined;
|
||||
|
||||
@ -1699,7 +1824,9 @@ var wombat_internal = function($wbwindow) {
|
||||
var new_message = {"from": from,
|
||||
"to_host": to,
|
||||
"src_id": src_id,
|
||||
"message": message};
|
||||
"message": message,
|
||||
"from_top": from_top,
|
||||
}
|
||||
|
||||
if (targetOrigin != "*") {
|
||||
targetOrigin = this.location.origin;
|
||||
@ -1737,7 +1864,9 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
var source = event.source;
|
||||
|
||||
if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) {
|
||||
if (event.data.from_top) {
|
||||
source = win.__WB_top_frame;
|
||||
} else if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) {
|
||||
source = win.__WB_win_id[event.data.src_id];
|
||||
}
|
||||
|
||||
@ -1804,7 +1933,9 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
|
||||
strUrl = rewrite_url(strUrl, false, "");
|
||||
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
|
||||
var res = orig.call(this, strUrl, strWindowName, strWindowFeatures);
|
||||
init_new_window_wombat(res, strUrl);
|
||||
return res;
|
||||
}
|
||||
|
||||
$wbwindow.open = open_rewritten;
|
||||
@ -1845,6 +1976,24 @@ var wombat_internal = function($wbwindow) {
|
||||
cookie = cookie.replace(wb_abs_prefix, '');
|
||||
cookie = cookie.replace(wb_rel_prefix, '');
|
||||
|
||||
// rewrite domain
|
||||
cookie = cookie.replace(cookie_domain_regex, function(m, m1) {
|
||||
var message = {"domain": m1,
|
||||
"cookie": cookie,
|
||||
"wb_type": "cookie",
|
||||
}
|
||||
|
||||
// norify of cookie setting to allow server-side tracking
|
||||
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
||||
|
||||
// if no subdomain, eg. "localhost", just remove domain altogether
|
||||
if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) {
|
||||
return "Domain=." + $wbwindow.location.hostname;
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
});
|
||||
|
||||
// rewrite path
|
||||
cookie = cookie.replace(cookie_path_regex, function(m, m1) {
|
||||
var rewritten = rewrite_url(m1);
|
||||
@ -1856,16 +2005,6 @@ var wombat_internal = function($wbwindow) {
|
||||
return "Path=" + rewritten;
|
||||
});
|
||||
|
||||
// rewrite domain
|
||||
cookie = cookie.replace(cookie_domain_regex, function(m, m1) {
|
||||
// if no subdomain, eg. "localhost", just remove domain altogether
|
||||
if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) {
|
||||
return "Domain=." + $wbwindow.location.hostname;
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
});
|
||||
|
||||
// rewrite secure, if needed
|
||||
if ($wbwindow.location.protocol != "https:") {
|
||||
cookie = cookie.replace("secure", "");
|
||||
@ -1988,7 +2127,11 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
//var src = iframe.src;
|
||||
var src = wb_getAttribute.call(iframe, "src");
|
||||
|
||||
|
||||
init_new_window_wombat(win, src);
|
||||
}
|
||||
|
||||
function init_new_window_wombat(win, src) {
|
||||
if (!src || src == "" || src == "about:blank" || src.indexOf("javascript:") >= 0) {
|
||||
win._WBWombat = wombat_internal(win);
|
||||
win._wb_wombat = new win._WBWombat(wb_info);
|
||||
@ -2100,6 +2243,20 @@ var wombat_internal = function($wbwindow) {
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_beacon_override()
|
||||
{
|
||||
if (!$wbwindow.navigator.sendBeacon) {
|
||||
return;
|
||||
}
|
||||
|
||||
var orig_sendBeacon = $wbwindow.navigator.sendBeacon;
|
||||
|
||||
$wbwindow.navigator.sendBeacon = function(url, data) {
|
||||
return orig_sendBeacon.call(this, rewrite_url(url), data);
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
function get_final_url(prefix, mod, url) {
|
||||
if (mod == undefined) {
|
||||
@ -2126,6 +2283,8 @@ var wombat_internal = function($wbwindow) {
|
||||
wb_opts = wbinfo.wombat_opts;
|
||||
wb_replay_prefix = wbinfo.prefix;
|
||||
|
||||
wb_info.top_host = wb_info.top_host || "*";
|
||||
|
||||
init_top_frame($wbwindow);
|
||||
init_wombat_top($wbwindow);
|
||||
|
||||
@ -2174,6 +2333,10 @@ var wombat_internal = function($wbwindow) {
|
||||
override_history_func("pushState");
|
||||
override_history_func("replaceState");
|
||||
|
||||
override_history_nav("go");
|
||||
override_history_nav("back");
|
||||
override_history_nav("forward");
|
||||
|
||||
// open
|
||||
init_open_override();
|
||||
|
||||
@ -2183,6 +2346,8 @@ var wombat_internal = function($wbwindow) {
|
||||
init_postmessage_override($wbwindow);
|
||||
}
|
||||
|
||||
init_hash_change();
|
||||
|
||||
// write
|
||||
init_write_override();
|
||||
|
||||
@ -2242,14 +2407,17 @@ var wombat_internal = function($wbwindow) {
|
||||
// Date
|
||||
init_date_override(wbinfo.wombat_sec);
|
||||
|
||||
|
||||
// registerProtocolHandler override
|
||||
init_registerPH_override();
|
||||
|
||||
//sendBeacon override
|
||||
init_beacon_override();
|
||||
|
||||
// expose functions
|
||||
this.extract_orig = extract_orig;
|
||||
this.rewrite_url = rewrite_url;
|
||||
this.watch_elem = watch_elem;
|
||||
this.init_new_window_wombat = init_new_window_wombat;
|
||||
}
|
||||
|
||||
function init_top_frame($wbwindow) {
|
||||
@ -2290,11 +2458,14 @@ var wombat_internal = function($wbwindow) {
|
||||
var real_parent = replay_top.__WB_orig_parent || replay_top.parent;
|
||||
|
||||
// Check to ensure top frame is different window and directly accessible (later refactor to support postMessage)
|
||||
try {
|
||||
if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) {
|
||||
real_parent = undefined;
|
||||
}
|
||||
} catch (e) {
|
||||
//try {
|
||||
// if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) {
|
||||
// real_parent = undefined;
|
||||
// }
|
||||
//} catch (e) {
|
||||
// real_parent = undefined;
|
||||
//}
|
||||
if (real_parent == $wbwindow || !wb_info.is_framed) {
|
||||
real_parent = undefined;
|
||||
}
|
||||
|
||||
|
@ -31,7 +31,7 @@ html, body
|
||||
</head>
|
||||
<body style="margin: 0px; padding: 0px;">
|
||||
<div class="wb_iframe_div">
|
||||
<iframe id="replay_iframe" src="{{ wbrequest.wb_prefix + embed_url }}" onload="iframe_loaded(event);" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe"></iframe>
|
||||
<iframe id="replay_iframe" src="{{ wbrequest.wb_prefix + embed_url }}" onload="" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe"></iframe>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -4,7 +4,7 @@
|
||||
wbinfo.url = "{{ cdx.url }}";
|
||||
wbinfo.timestamp = "{{ cdx.timestamp }}";
|
||||
wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}";
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
wbinfo.prefix = decodeURI("{{ wbrequest.wb_prefix }}");
|
||||
wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
|
||||
wbinfo.top_url = "{{ top_url }}";
|
||||
wbinfo.is_framed = {{ "true" if wbrequest.options.is_framed else "false" }};
|
||||
|
@ -392,6 +392,7 @@ class HttpLoader(BaseLoader):
|
||||
self.session = requests.Session()
|
||||
|
||||
r = self.session.get(url, headers=headers, stream=True)
|
||||
r.raise_for_status()
|
||||
return r.raw
|
||||
|
||||
|
||||
|
@ -7,6 +7,7 @@ from copy import copy
|
||||
from six.moves import range
|
||||
from six import iteritems
|
||||
from pywb.utils.loaders import to_native_str
|
||||
import uuid
|
||||
|
||||
|
||||
WRAP_WIDTH = 80
|
||||
@ -257,6 +258,12 @@ class StatusAndHeadersParser(object):
|
||||
plen = len(prefix)
|
||||
return (key_upper[:plen], key[plen:])
|
||||
|
||||
@staticmethod
|
||||
def make_warc_id(id_=None):
|
||||
if not id_:
|
||||
id_ = uuid.uuid1()
|
||||
return '<urn:uuid:{0}>'.format(id_)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class StatusAndHeadersParserException(Exception):
|
||||
|
@ -138,6 +138,7 @@ import pytest
|
||||
import six
|
||||
from six import StringIO
|
||||
from io import BytesIO
|
||||
import requests
|
||||
|
||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||
@ -176,6 +177,14 @@ def test_s3_read_1():
|
||||
assert reader.readline() == b'WARC/1.0\r\n'
|
||||
assert reader.readline() == b'WARC-Type: response\r\n'
|
||||
|
||||
def test_limit_post():
|
||||
reader = LimitReader(BytesIO(b'abcdefg'), 3)
|
||||
r = requests.request(method='POST',
|
||||
url='http://httpbin.org/post',
|
||||
data=reader,
|
||||
headers={'Content-Length': '3'})
|
||||
|
||||
assert '"abc"' in r.text
|
||||
|
||||
# Error
|
||||
def test_err_no_such_file():
|
||||
|
@ -121,6 +121,18 @@ def iso_date_to_timestamp(string):
|
||||
|
||||
return datetime_to_timestamp(iso_date_to_datetime(string))
|
||||
|
||||
def timestamp_to_iso_date(string):
|
||||
"""
|
||||
>>> timestamp_to_iso_date('20131226101112')
|
||||
'2013-12-26T10:11:12Z'
|
||||
|
||||
>>> timestamp_to_iso_date('20131226101112')
|
||||
'2013-12-26T10:11:12Z'
|
||||
"""
|
||||
|
||||
|
||||
return datetime_to_iso_date(timestamp_to_datetime(string))
|
||||
|
||||
|
||||
def http_date_to_timestamp(string):
|
||||
"""
|
||||
|
@ -54,15 +54,18 @@ class ArchiveIterator(object):
|
||||
|
||||
|
||||
def __init__(self, fileobj, no_record_parse=False,
|
||||
verify_http=False):
|
||||
verify_http=False, arc2warc=False):
|
||||
self.fh = fileobj
|
||||
|
||||
self.loader = ArcWarcRecordLoader(verify_http=verify_http)
|
||||
self.loader = ArcWarcRecordLoader(verify_http=verify_http,
|
||||
arc2warc=arc2warc)
|
||||
self.reader = None
|
||||
|
||||
self.offset = 0
|
||||
self.known_format = None
|
||||
|
||||
self.mixed_arc_warc = arc2warc
|
||||
|
||||
self.member_info = None
|
||||
self.no_record_parse = no_record_parse
|
||||
|
||||
@ -226,7 +229,8 @@ class ArchiveIterator(object):
|
||||
self.member_info = None
|
||||
|
||||
# Track known format for faster parsing of other records
|
||||
self.known_format = record.format
|
||||
if not self.mixed_arc_warc:
|
||||
self.known_format = record.format
|
||||
|
||||
return record
|
||||
|
||||
@ -359,6 +363,9 @@ class DefaultRecordParser(object):
|
||||
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
|
||||
continue
|
||||
|
||||
if record.rec_type == 'arc_header':
|
||||
continue
|
||||
|
||||
if record.format == 'warc':
|
||||
if (record.rec_type in ('request', 'warcinfo') and
|
||||
not include_all and
|
||||
@ -495,9 +502,6 @@ class DefaultRecordParser(object):
|
||||
def parse_arc_record(self, record):
|
||||
""" Parse arc record
|
||||
"""
|
||||
if record.rec_type == 'arc_header':
|
||||
return None
|
||||
|
||||
url = record.rec_headers.get_header('uri')
|
||||
url = url.replace('\r', '%0D')
|
||||
url = url.replace('\n', '%0A')
|
||||
@ -528,7 +532,8 @@ class DefaultRecordParser(object):
|
||||
|
||||
def __call__(self, fh):
|
||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
|
||||
self.options.get('verify_http', False))
|
||||
self.options.get('verify_http', False),
|
||||
self.options.get('arc2warc', False))
|
||||
|
||||
entry_iter = self.create_record_iter(aiter)
|
||||
|
||||
|
@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.timeutils import timestamp_to_iso_date
|
||||
|
||||
from six.moves import zip
|
||||
import six
|
||||
@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException):
|
||||
|
||||
#=================================================================
|
||||
class ArcWarcRecordLoader(object):
|
||||
# Standard ARC v1.0 headers
|
||||
# TODO: support ARC v2.0 also?
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
|
||||
|
||||
HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
|
||||
@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object):
|
||||
HTTP_SCHEMES = ('http:', 'https:')
|
||||
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||
verify_http=True):
|
||||
verify_http=True, arc2warc=True):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker=cookie_maker)
|
||||
|
||||
self.loader = loader
|
||||
self.block_size = block_size
|
||||
|
||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||
if arc2warc:
|
||||
self.arc_parser = ARC2WARCHeadersParser()
|
||||
else:
|
||||
self.arc_parser = ARCHeadersParser()
|
||||
|
||||
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
|
||||
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
|
||||
@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object):
|
||||
else:
|
||||
rec_type = 'response'
|
||||
|
||||
elif the_format == 'warc':
|
||||
elif the_format in ('warc', 'arc2warc'):
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
uri = rec_headers.get_header('WARC-Target-URI')
|
||||
length = rec_headers.get_header('Content-Length')
|
||||
content_type = rec_headers.get_header('Content-Type')
|
||||
sub_len = 0
|
||||
if the_format == 'warc':
|
||||
sub_len = 0
|
||||
else:
|
||||
sub_len = rec_headers.total_len
|
||||
the_format = 'warc'
|
||||
|
||||
is_err = False
|
||||
|
||||
@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object):
|
||||
# now try as arc
|
||||
try:
|
||||
rec_headers = self.arc_parser.parse(stream, statusline)
|
||||
return 'arc', rec_headers
|
||||
return self.arc_parser.get_rec_type(), rec_headers
|
||||
except StatusAndHeadersParserException as se:
|
||||
if known_format == 'arc':
|
||||
msg = 'Invalid ARC record, first line: '
|
||||
@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object):
|
||||
|
||||
#=================================================================
|
||||
class ARCHeadersParser(object):
|
||||
def __init__(self, headernames):
|
||||
self.headernames = headernames
|
||||
# ARC 1.0 headers
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
def __init__(self):
|
||||
self.headernames = self.get_header_names()
|
||||
|
||||
def get_rec_type(self):
|
||||
return 'arc'
|
||||
|
||||
def parse(self, stream, headerline=None):
|
||||
total_read = 0
|
||||
@ -250,12 +260,60 @@ class ARCHeadersParser(object):
|
||||
msg = msg.format(headernames, parts)
|
||||
raise StatusAndHeadersParserException(msg, parts)
|
||||
|
||||
headers = []
|
||||
|
||||
for name, value in zip(headernames, parts):
|
||||
headers.append((name, value))
|
||||
protocol, headers = self._get_protocol_and_headers(headerline, parts)
|
||||
|
||||
return StatusAndHeaders(statusline='',
|
||||
headers=headers,
|
||||
protocol='ARC/1.0',
|
||||
protocol='WARC/1.0',
|
||||
total_len=total_read)
|
||||
|
||||
@classmethod
|
||||
def get_header_names(cls):
|
||||
return cls.ARC_HEADERS
|
||||
|
||||
def _get_protocol_and_headers(self, headerline, parts):
|
||||
headers = []
|
||||
|
||||
for name, value in zip(self.headernames, parts):
|
||||
headers.append((name, value))
|
||||
|
||||
return ('ARC/1.0', headers)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ARC2WARCHeadersParser(ARCHeadersParser):
|
||||
# Headers for converting ARC -> WARC Header
|
||||
ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
|
||||
"WARC-IP-Address",
|
||||
"WARC-Date",
|
||||
"Content-Type",
|
||||
"Content-Length"]
|
||||
|
||||
def get_rec_type(self):
|
||||
return 'arc2warc'
|
||||
|
||||
@classmethod
|
||||
def get_header_names(cls):
|
||||
return cls.ARC_TO_WARC_HEADERS
|
||||
|
||||
def _get_protocol_and_headers(self, headerline, parts):
|
||||
headers = []
|
||||
|
||||
for name, value in zip(self.headernames, parts):
|
||||
if name == 'WARC-Date':
|
||||
value = timestamp_to_iso_date(value)
|
||||
|
||||
headers.append((name, value))
|
||||
|
||||
if headerline.startswith('filedesc://'):
|
||||
rec_type = 'arc_header'
|
||||
else:
|
||||
rec_type = 'response'
|
||||
|
||||
headers.append(('WARC-Type', rec_type))
|
||||
headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
|
||||
|
||||
return ('WARC/1.0', headers)
|
||||
|
||||
|
||||
|
@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
# arc.gz
|
||||
>>> print_cdx_index('example.arc.gz', arc2warc=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
||||
|
||||
# arc
|
||||
>>> print_cdx_index('example.arc', arc2warc=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
|
||||
|
||||
|
||||
# wget warc, includes metadata by default
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
@ -328,6 +341,22 @@ def test_cdxj_arc_minimal():
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
""")
|
||||
|
||||
def test_cdxj_arc_conv():
|
||||
# arc.gz -- json
|
||||
res = cdx_index('example.arc.gz', cdxj=True, arc2warc=True)
|
||||
assert parse_cdxj(res) == parse_cdxj(b"""
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
""")
|
||||
|
||||
def test_cdxj_arc_minimal_conv():
|
||||
# arc.gz -- minimal + json
|
||||
res = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True)
|
||||
assert parse_cdxj(res) == parse_cdxj(b"""
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
""")
|
||||
|
||||
|
||||
|
||||
|
||||
def test_cdxj_empty():
|
||||
options = dict(cdxj=True)
|
||||
|
@ -1,11 +1,13 @@
|
||||
from pywb.cdx.cdxserver import create_cdx_server
|
||||
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.framework.basehandlers import BaseHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.webapp.query_handler import QueryHandler
|
||||
|
||||
from six.moves.urllib.parse import parse_qs
|
||||
import json
|
||||
import six
|
||||
|
||||
|
||||
@ -21,7 +23,18 @@ class CDXAPIHandler(BaseHandler):
|
||||
def __call__(self, wbrequest):
|
||||
params = self.extract_params_from_wsgi_env(wbrequest.env)
|
||||
|
||||
cdx_iter = self.index_handler.load_cdx(wbrequest, params)
|
||||
try:
|
||||
cdx_iter = self.index_handler.load_cdx(wbrequest, params)
|
||||
except NotFoundException:
|
||||
msg = 'No Captures found for: ' + params.get('url')
|
||||
if params.get('output') == 'json':
|
||||
msg = json.dumps(dict(error=msg))
|
||||
content_type='application/json'
|
||||
else:
|
||||
content_type='text/plain'
|
||||
|
||||
return WbResponse.text_response(msg, content_type=content_type,
|
||||
status='404 Not Found')
|
||||
|
||||
return WbResponse.text_stream(cdx_iter,
|
||||
content_type='text/plain')
|
||||
|
@ -10,5 +10,5 @@ if (some_val) {
|
||||
}
|
||||
</script>
|
||||
Test Content
|
||||
<a href="another.html">Some Link</a>
|
||||
<a href="/some/path/another.html">Some Link</a>
|
||||
</body>
|
||||
|
@ -5,4 +5,4 @@ if (some_val) {
|
||||
}
|
||||
</script>
|
||||
Test Content
|
||||
<a href="another.html">Some Link</a>
|
||||
<a href="/some/path/another.html">Some Link</a>
|
||||
|
@ -125,7 +125,7 @@ class TestProxyLiveRewriter:
|
||||
|
||||
def test_echo_proxy_start_unbounded_remove_range(self):
|
||||
headers = [('Range', 'bytes=0-')]
|
||||
resp = self.testapp.get('/rewrite/http://example.com/', headers=headers)
|
||||
resp = self.testapp.get('/rewrite/http://httpbin.org/range/100', headers=headers)
|
||||
|
||||
# actual response is with range
|
||||
assert resp.status_int == 206
|
||||
@ -138,7 +138,7 @@ class TestProxyLiveRewriter:
|
||||
assert self.requestlog[0] == resp.text
|
||||
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
|
||||
|
||||
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
|
||||
assert self.requestlog[0].startswith('GET http://httpbin.org/range/100 HTTP/1.1')
|
||||
assert 'range: ' not in self.requestlog[0]
|
||||
|
||||
assert len(self.cache) == 0
|
||||
|
@ -4,7 +4,6 @@ from pywb.framework.wsgi_wrappers import init_app
|
||||
import webtest
|
||||
import pywb.rewrite.rewrite_live
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MockYTDWrapper(object):
|
||||
def extract_info(self, url):
|
||||
@ -47,6 +46,7 @@ class TestLiveRewriter:
|
||||
def test_live_live_frame(self):
|
||||
resp = self.testapp.get('/live/http://example.com/')
|
||||
assert resp.status_int == 200
|
||||
resp.charset = 'utf-8'
|
||||
assert '<iframe ' in resp.text
|
||||
assert 'src="http://localhost:80/live/mp_/http://example.com/"' in resp.text, resp.text
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user