diff --git a/README.rst b/README.rst
index 010a6f3e..60f5237d 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.31.0
+PyWb 0.32.0
===========
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
diff --git a/pywb/__init__.py b/pywb/__init__.py
index 9f66d658..7cdc246c 100644
--- a/pywb/__init__.py
+++ b/pywb/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.31.0'
+__version__ = '0.32.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'
diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index 610df546..564d5b91 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -17,6 +17,12 @@ class RewrittenStatusAndHeaders(object):
def contains_removed_header(self, name, value):
return self.removed_header_dict.get(name) == value
+ def readd_rewrite_removed(self):  # re-append to status_headers each PROXY_NO_REWRITE_HEADERS entry recorded in removed_header_dict
+ for name in HeaderRewriter.PROXY_NO_REWRITE_HEADERS:
+ value = self.removed_header_dict.get(name)
+ if value is not None:  # only headers that were actually recorded as removed
+ self.status_headers.headers.append((name, value))  # appended under the lowercase name used as the dict key
+
#=================================================================
class HeaderRewriter(object):
@@ -34,19 +40,21 @@ class HeaderRewriter(object):
'json': ['application/json'],
'xml': ['/xml', '+xml', '.xml', '.rss'],
+
+ 'plain': ['text/plain'],
}
PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range',
- 'accept-ranges']
+ 'accept-ranges', 'www-authenticate', 'proxy-authenticate']
URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base']
- ENCODING_HEADERS = ['content-encoding']
+ #ENCODING_HEADERS = ['content-encoding']
REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy',
'strict-transport-security']
- PROXY_NO_REWRITE_HEADERS = ['content-length']
+ PROXY_NO_REWRITE_HEADERS = ['content-length', 'content-encoding']
COOKIE_HEADERS = ['set-cookie', 'cookie']
@@ -141,9 +149,10 @@ class HeaderRewriter(object):
elif urlrewriter and lowername in self.URL_REWRITE_HEADERS:
new_headers.append((name, urlrewriter.rewrite(value)))
- elif lowername in self.ENCODING_HEADERS:
+ elif lowername in self.PROXY_NO_REWRITE_HEADERS:
if content_rewritten:
removed_header_dict[lowername] = value
+ add_prefixed_header(name, value)
else:
add_header(name, value)
@@ -151,10 +160,6 @@ class HeaderRewriter(object):
removed_header_dict[lowername] = value
add_prefixed_header(name, value)
- elif (lowername in self.PROXY_NO_REWRITE_HEADERS and
- not content_rewritten):
- add_header(name, value)
-
elif (lowername in self.COOKIE_HEADERS and
cookie_rewriter):
cookie_list = cookie_rewriter.rewrite(value)
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 90148c1f..21989a29 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -40,6 +40,7 @@ class HTMLRewriterMixin(object):
'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting
'iframe': {'src': 'if_'},
+ 'image': {'src': 'im_', 'xlink:href': 'im_'},
'img': {'src': 'im_',
'srcset': 'im_'},
'ins': {'cite': defmod},
@@ -118,6 +119,8 @@ class HTMLRewriterMixin(object):
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
re.IGNORECASE | re.MULTILINE)
+ ADD_WINDOW = re.compile('(?>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
-[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
+[('Set-Cookie', 'abc=def; Path=file.html')]
# keep Max-Age
->>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
+>>> rewrite_cookie('abc=def; Path=/file.html; Max-Age=1500', urlrewriter2, 'coll')
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
# Cookie with invalid chars, not parsed
@@ -92,14 +92,14 @@ def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
def test_with_expires():
# keep expires
- res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
+ res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
assert len(res) == 1
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
def test_with_expires_utc_replace():
# keep expires, UTC->GMT
- res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
+ res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
assert len(res) == 1
assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html'
@@ -113,14 +113,14 @@ def test_http_secure_flag():
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
def test_secure_flag_remove():
# Secure Remove
- res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
+ res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter2, 'coll')
assert len(res) == 1
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html'
@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
def test_secure_flag_keep():
# Secure Keep
- res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
+ res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter3, 'coll')
assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html; secure'
diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py
index 8e1f1a87..e58c6d84 100644
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@@ -6,7 +6,7 @@ HTTP Headers Rewriting
# Text with charset
>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')])
{'charset': 'utf-8',
- 'removed_header_dict': {},
+ 'removed_header_dict': {'content-length': '5'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
('X-Archive-Orig-Content-Length', '5'),
('Content-Type', 'text/html;charset=UTF-8')]),
@@ -24,9 +24,11 @@ HTTP Headers Rewriting
>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'content-encoding': 'gzip',
+ 'content-length': '199999',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript'),
+ ('X-Archive-Orig-Content-Encoding', 'gzip'),
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': 'js'}
@@ -76,7 +78,7 @@ def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
def test_cookie_headers():
# cookie, host/origin rewriting
res = _test_head_data([('Connection', 'close'),
- ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'),
+ ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'),
('Host', 'example.com'),
('Origin', 'https://example.com')])
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index afb1da93..d8087555 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -8,7 +8,7 @@ r"""
#=================================================================
>>> parse('Text')
-Text
+Text
>>> parse('

')

@@ -35,8 +35,8 @@ r"""
>>> parse('', urlrewriter=full_path_urlrewriter)
->>> parse('
')
-
+>>> parse('
')
+
# ensure trailing slash added
>>> parse('')
@@ -47,7 +47,7 @@ r"""
>>> parse('
', urlrewriter=no_base_canon_rewriter)
-
+
# Empty url
>>> parse('')
@@ -56,6 +56,9 @@ r"""
>>> parse('')
+# href on other tags
+>>> parse('Text
')
+Text
# HTML Entities
>>> parse('› > ?')
@@ -145,25 +148,40 @@ r"""
>>> parse('')
-
+
+
+>>> parse('')
+
>>> parse('')
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse("")
+
+
+#>>> parse('')
+
+
# Style
->>> parse('')
-
+>>> parse('')
+
# Unterminated style tag, handle and auto-terminate
>>> parse('
+
# Head Insertion
->>> parse('Test', head_insert = '')
-Test
+>>> parse('Test', head_insert = '')
+Test
>>> parse('', head_insert = '')
-
+
>>> parse('Test', head_insert = '')
Test
@@ -171,7 +189,7 @@ r"""
>>> parse('SomeTest
', head_insert = '/* Insert */')
/* Insert */SomeTest
->>> parse('SomeTest
', head_insert = '')
+>>> parse('SomeTest
', head_insert = '')
SomeTest
>>> parse('Some Text without any tags ', head_insert = '')
@@ -218,7 +236,7 @@ r"""
# remove extra spaces
>>> parse('Text')
-Text
+Text
>>> parse('Text')
Text
diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py
index 30480660..2762b9d0 100644
--- a/pywb/rewrite/test/test_regex_rewriters.py
+++ b/pywb/rewrite/test/test_regex_rewriters.py
@@ -144,8 +144,14 @@ r"""
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010/http://domain.com/path.html")'
+>>> _test_css('background: url(" http://domain.com/path.html ")')
+'background: url(" /web/20131010/http://domain.com/path.html ")'
+
+>>> _test_css('background: url(" http://domain.com/path.html x ")')
+'background: url(" /web/20131010/http://domain.com/path.html x ")'
+
>>> _test_css("background: url(file.jpeg)")
-'background: url(/web/20131010/http://example.com/file.jpeg)'
+'background: url(file.jpeg)'
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
"background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')"
@@ -157,18 +163,18 @@ r"""
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
-'background: url ("/web/20131010/http://example.com/weirdpath\')'
+'background: url ("weirdpath\')'
->>> _test_css("@import url ('path.css')")
+>>> _test_css("@import url ('/path.css')")
"@import url ('/web/20131010/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
-"@import url('/web/20131010/http://example.com/path.css')"
+"@import url('path.css')"
>>> _test_css("@import ( 'path.css')")
-"@import ( '/web/20131010/http://example.com/path.css')"
+"@import ( 'path.css')"
->>> _test_css("@import \"path.css\"")
+>>> _test_css("@import \"/path.css\"")
'@import "/web/20131010/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
@@ -178,7 +184,7 @@ r"""
'@import (\'/web/20131010/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
-'@import ("/web/20131010/http://example.com/url.css")'
+'@import ("url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)'
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index d3ffc3d8..62fa3bf9 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -123,7 +123,7 @@ def test_local_no_head_banner_only():
assert 'window.location = "/other.html"' in buff
# link NOT rewritten
- assert '"another.html"' in buff
+ assert '"/some/path/another.html"' in buff
def test_local_banner_only_no_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
@@ -138,7 +138,7 @@ def test_local_banner_only_no_rewrite():
assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
# link NOT rewritten
- assert '"another.html"' in buff
+ assert '"/some/path/another.html"' in buff
def test_local_2_link_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index ac23051a..da243cd7 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -21,19 +21,19 @@
# UrlRewriter tests
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
-'/web/20131010/http://example.com/path/other.html'
+'other.html'
->>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
+>>> do_rewrite('/path/file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
'/web/20131010js_/http://example.com/path/file.js'
->>> do_rewrite('file.js', '20131010/http://example.com/', '/coll/')
+>>> do_rewrite('/file.js', '20131010/http://example.com/', '/coll/')
'/coll/20131010/http://example.com/file.js'
->>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', 'js_')
+>>> do_rewrite('/file.js', '20131010/http://example.com', '/coll/', 'js_')
'/coll/20131010js_/http://example.com/file.js'
>>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', '')
-'/coll/20131010/http://example.com/file.js'
+'file.js'
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', 'http://localhost:8080/coll/')
'/coll/20130907*/http://example.com/other.html'
@@ -41,8 +41,8 @@
>>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/other.html'
->>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
-'/coll/20130907*/http://example.com/path/other.html'
+>>> do_rewrite('other.html', '20130907*/http://example.com/path/page.html', '/coll/')
+'other.html'
>>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
'/coll/20131112im_/http://example.com/other.html'
@@ -87,7 +87,7 @@
'2020/http://example.com/other.html'
>>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
-'/web/20131010010203/http://example.com/file.html'
+''
>>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'#anchor'
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 4774bc03..d6fda47f 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -19,6 +19,9 @@ class UrlRewriter(object):
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
+ PARENT_PATH = '../'
+ REL_PATH = '/'
+
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None, rewrite_opts=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
@@ -60,6 +63,11 @@ class UrlRewriter(object):
if url.startswith(self.REL_SCHEME):
is_abs = True
scheme_rel = True
+ elif (not is_abs and
+ not url.startswith(self.REL_PATH) and
+ self.PARENT_PATH not in url):
+ return url
+
# if prefix starts with a scheme
#if self.prefix_scheme:
# url = self.prefix_scheme + ':' + url
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 3b4e08cf..73eab273 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -135,6 +135,15 @@ rules:
fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
+ # periscope
+ #=================================================================
+
+ - url_prefix: 'tv,periscope,assets)/js/'
+
+ rewrite:
+ js_regexs:
+ - match: '"location"'
+ replace: '"WB_wombat_location"'
# vimeo rules
#=================================================================
diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js
index 3deb2343..f4368ece 100644
--- a/pywb/static/vidrw.js
+++ b/pywb/static/vidrw.js
@@ -165,7 +165,7 @@ __wbvidrw = (function() {
var name = child.getAttribute("name");
name = name.toLowerCase();
- if (name == "movie") {
+ if (name == "movie" || name == "src") {
var value = child.getAttribute("value");
obj_url = value;
}
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index f57e833b..6b5693c1 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -110,7 +110,7 @@ function remove_event(name, func, object) {
}
}
-function notify_top() {
+function notify_top(event) {
if (!window.__WB_top_frame) {
return;
}
@@ -123,25 +123,18 @@ function notify_top() {
return;
}
- //if (window.__WB_top_frame.update_wb_url) {
- // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
- // wbinfo.timestamp,
- // wbinfo.request_ts,
- // wbinfo.is_live);
- //}
-
var message = {
"url": window.WB_wombat_location.href,
"ts": wbinfo.timestamp,
"request_ts": wbinfo.request_ts,
"is_live": wbinfo.is_live,
- "title": "",
+ "title": document ? document.title : "",
"wb_type": "load",
}
window.__WB_top_frame.postMessage(message, "*");
- remove_event("readystatechange", notify_top, document);
+ //remove_event("readystatechange", notify_top, document);
}
this.load = function() {
@@ -152,7 +145,7 @@ this.load = function() {
window._wb_js_inited = true;
// Non-Framed Replay OR top frame for framed replay!
- if (window.wbinfo && (!window.__WB_top_frame || window.__WB_top_frame == window)) {
+ if (window.wbinfo && !window.__WB_top_frame) {
if (wbinfo.is_framed && wbinfo.mod != "bn_") {
var hash = window.location.hash;
@@ -171,7 +164,7 @@ this.load = function() {
add_event("readystatechange", init_banner, document);
// Framed Replay
- } else if (window.__WB_top_frame && window != window.__WB_top_frame && window.__WB_top_frame.update_wb_url) {
+ } else if (window.__WB_top_frame) {
add_event("readystatechange", notify_top, document);
}
}
diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js
index 168b914f..2f32f43b 100644
--- a/pywb/static/wb_frame.js
+++ b/pywb/static/wb_frame.js
@@ -19,26 +19,31 @@ This file is part of pywb, https://github.com/ikreymer/pywb
var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/;
-var TS_REGEX = /\/([\d]{1,14})\//;
+var TS_REGEX = /\/([\d]{1,14})(?:\w+_)?\/(?:\w+[:])?\/\//;
-var curr_state = {};
+//var curr_state = {};
var IFRAME_ID = "replay_iframe";
-function make_url(url, ts, mod)
+var last_inner_hash = undefined;
+
+function make_url(url, ts, mod, prefix)
{
if (ts || mod) {
mod += "/";
}
+ prefix = prefix || wbinfo.prefix;
+
if (ts) {
- return wbinfo.prefix + ts + mod + url;
+ return prefix + ts + mod + url;
} else {
- return wbinfo.prefix + mod + url;
+ return prefix + mod + url;
}
}
function push_state(state) {
+ /*
var frame = document.getElementById(IFRAME_ID).contentWindow;
if (frame.WB_wombat_location) {
var curr_href = frame.WB_wombat_location.href;
@@ -48,13 +53,19 @@ function push_state(state) {
return;
}
}
+ */
- state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
+ state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod, wbinfo.outer_prefix);
state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
- var canon_url = make_url(state.url, state.request_ts, "");
+ var canon_url = make_url(state.url, state.request_ts, "", wbinfo.outer_prefix);
+
if (window.location.href != canon_url) {
- window.history.replaceState(state, "", canon_url);
+ if (state.wb_type != "pushState") {
+ window.history.replaceState(state, "", canon_url);
+ } else {
+ window.history.pushState(state, "", canon_url);
+ }
}
set_state(state);
@@ -63,8 +74,8 @@ function push_state(state) {
function pop_state(state) {
set_state(state);
- var frame = document.getElementById(IFRAME_ID).contentWindow;
- frame.src = state.inner_url;
+ //var frame = document.getElementById(IFRAME_ID);
+ //frame.src = state.inner_url;
}
function extract_ts(url)
@@ -103,7 +114,7 @@ function set_state(state) {
}
}
- curr_state = state;
+ //curr_state = state;
}
window.onpopstate = function(event) {
@@ -123,43 +134,6 @@ function extract_ts_cookie(value) {
}
}
-function iframe_loaded(event) {
- var url;
- var ts;
- var request_ts;
- var capture_str;
- var is_live = false;
- var iframe = document.getElementById(IFRAME_ID).contentWindow;
-
- if (iframe.WB_wombat_location) {
- url = iframe.WB_wombat_location.href;
- } else {
- url = extract_replay_url(iframe.location.href);
- }
-
- if (iframe.wbinfo) {
- ts = iframe.wbinfo.timestamp;
- request_ts = iframe.wbinfo.request_ts;
- is_live = iframe.wbinfo.is_live;
- } else {
- ts = extract_ts_cookie(iframe.document.cookie);
- if (ts) {
- is_live = true;
- } else {
- ts = extract_ts(iframe.location.href);
- }
- request_ts = ts;
- }
-
- var state = {}
- state["url"] = url;
- state["ts"] = ts;
- state["request_ts"] = request_ts;
- state["is_live"] = is_live
-
- update_wb_url(state);
-}
-
function init_pm() {
var frame = document.getElementById(IFRAME_ID).contentWindow;
@@ -172,7 +146,8 @@ function init_pm() {
// Check if iframe url change message
if (typeof(event.data) == "object" && event.data["wb_type"]) {
- update_wb_url(event.data);
+ handle_message(event.data);
+
} else {
// Pass to parent
window.parent.postMessage(event.data, "*");
@@ -187,55 +162,67 @@ function init_pm() {
}
-function update_wb_url(state) {
- if (curr_state.url == state.url && curr_state.ts == state.ts) {
- return;
+function handle_message(state) { // dispatch on wb_type for a message object received by the init_pm listener
+ var type = state.wb_type;
+
+ if (type == "load" || type == "pushState" || type == "replaceState") {
+ update_wb_url(state); // sets capture_str and updates this window's history entry via push_state()
+ } else if (type == "go") {
+ window.history.go(state.param); // state.param carries the first argument of the inner history.go() call
+ } else if (type == "back") {
+ window.history.back();
+ } else if (type == "forward") {
+ window.history.forward();
+ } else if (type == "hashchange") {
+ inner_hash_changed(state); // copy the hash reported in the message onto this window
}
+} // any other wb_type falls through and is ignored
+
+
+function update_wb_url(state) {
+ //if (curr_state && curr_state.url == state.url && curr_state.ts == state.ts) {
+ // return;
+ //}
state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
push_state(state);
}
-// Load Banner
-if (_wb_js) {
- _wb_js.load();
+function inner_hash_changed(state) { // handler for "hashchange" messages: mirror state.hash onto this window
+ if (window.location.hash != state.hash) { // skip the assignment when the hash is already in sync
+ window.location.hash = state.hash;
+ }
+ last_inner_hash = state.hash; // remembered so outer_hash_changed() can tell this change came from the frame and not echo it back
+}
+
+function outer_hash_changed(event) {
+ if (window.location.hash == last_inner_hash) {
+ return;
+ }
+
+ var frame = document.getElementById(IFRAME_ID).contentWindow;
+
+ var message = {"wb_type": "outer_hashchange", "hash": window.location.hash}
+
+ frame.postMessage(message, "*", undefined, true);
}
function init_hash_connect() {
- var frame = document.getElementById(IFRAME_ID).contentWindow;
+ var frame = document.getElementById(IFRAME_ID);
if (window.location.hash) {
var curr_url = wbinfo.capture_url + window.location.hash;
-
- frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
+
+ frame.src = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
+
+ last_inner_hash = window.location.hash;
+ //frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod);
//frame.location.hash = window.location.hash;
}
-
- function outer_hash_changed() {
- var the_frame = document.getElementById(IFRAME_ID).contentWindow;
-
- if (window.location.hash == the_frame.location.hash) {
- return;
- }
-
- the_frame.location.hash = window.location.hash;
- //the_frame.location.href = make_url(curr_url, curr_state.request_ts, wbinfo.replay_mod);
- }
-
- function inner_hash_changed() {
- var the_frame = document.getElementById(IFRAME_ID).contentWindow;
-
- if (window.location.hash == the_frame.location.hash) {
- return;
- }
-
- window.location.hash = the_frame.location.hash;
- }
if ("onhashchange" in window) {
window.addEventListener("hashchange", outer_hash_changed, false);
- frame.addEventListener("hashchange", inner_hash_changed, false);
}
// Init Post Message connect
@@ -244,3 +231,10 @@ function init_hash_connect() {
document.addEventListener("DOMContentLoaded", init_hash_connect);
+// Load Banner
+if (_wb_js) {
+ _wb_js.load();
+}
+
+
+
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index af280f94..2d75f577 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
*/
//============================================
-// Wombat JS-Rewriting Library v2.12
+// Wombat JS-Rewriting Library v2.16
//============================================
@@ -314,6 +314,8 @@ var wombat_internal = function($wbwindow) {
return "";
}
+ var orig_href = href;
+
// proxy mode: no extraction needed
if (!wb_replay_prefix) {
return href;
@@ -348,7 +350,7 @@ var wombat_internal = function($wbwindow) {
href = href.substr(4);
}
- if (!starts_with(href, VALID_PREFIXES)) {
+ if (href != orig_href && !starts_with(href, VALID_PREFIXES)) {
href = HTTP_PREFIX + href;
}
}
@@ -402,7 +404,17 @@ var wombat_internal = function($wbwindow) {
function make_parser(href) {
href = extract_orig(href);
- var p = $wbwindow.document.createElement("a", true);
+ var baseWin;
+
+ // special case: for newly opened blank windows, use the opener
+ // to create parser to have the proper baseURI
+ if ($wbwindow.location.href == "about:blank" && $wbwindow.opener) {
+ baseWin = $wbwindow.opener;
+ } else {
+ baseWin = $wbwindow;
+ }
+
+ var p = baseWin.document.createElement("a", true);
p.href = href;
return p;
}
@@ -712,21 +724,21 @@ var wombat_internal = function($wbwindow) {
function rewritten_func(state_obj, title, url) {
url = rewrite_url(url);
+ var abs_url = extract_orig(url);
+
+ if (abs_url && !starts_with(abs_url, $wbwindow.WB_wombat_location.origin + "/")) {
+ throw new DOMException("Invalid history change: " + abs_url);
+ }
+
if (url == $wbwindow.location.href) {
return;
}
orig_func.call(this, state_obj, title, url);
- //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
- // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
- // wb_info.timestamp,
- // wb_info.request_ts,
- // wb_info.is_live);
- //}
- if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
+ if ($wbwindow.__WB_top_frame) {
var message = {
- "url": url,
+ "url": abs_url,
"ts": wb_info.timestamp,
"request_ts": wb_info.request_ts,
"is_live": wb_info.is_live,
@@ -734,7 +746,7 @@ var wombat_internal = function($wbwindow) {
"wb_type": func_name,
}
- $wbwindow.__WB_top_frame.postMessage(message, "*");
+ $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
}
}
@@ -746,6 +758,45 @@ var wombat_internal = function($wbwindow) {
return rewritten_func;
}
+ //============================================
+ function override_history_nav(func_name) { // wrap history[func_name] so each call also posts {"wb_type": func_name} to the top frame
+ if (!$wbwindow.history) {
+ return;
+ }
+
+ // Only useful for framed replay
+ if (!$wbwindow.__WB_top_frame) {
+ return;
+ }
+
+ var orig_func = $wbwindow.history[func_name];
+
+ if (!orig_func) { // this history method does not exist in the window: nothing to wrap
+ return;
+ }
+
+ function rewritten_func() {
+ orig_func.apply(this, arguments); // run the native call first; its return value is not propagated
+
+ var message = {
+ "wb_type": func_name,
+ }
+
+ if (func_name == "go") {
+ message["param"] = arguments[0]; // forward the first argument of go() so the receiver can repeat the call
+ }
+
+ $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
+ }
+
+ $wbwindow.history[func_name] = rewritten_func; // install on the history instance ...
+ if ($wbwindow.History && $wbwindow.History.prototype) {
+ $wbwindow.History.prototype[func_name] = rewritten_func; // ... and on the prototype when it is exposed
+ }
+
+ return rewritten_func; // the early-exit paths above return undefined instead
+ }
+
//============================================
function init_ajax_rewrite() {
if (!$wbwindow.XMLHttpRequest ||
@@ -1157,6 +1208,35 @@ var wombat_internal = function($wbwindow) {
return value;
}
+ //============================================
+ function rewrite_frame_src(elem, name) // rewrite attribute `name` of elem in place; returns true only if the attribute value was changed
+ {
+ var value = wb_getAttribute.call(elem, name);
+ var new_value = undefined;
+
+ // special case for rewriting javascript: urls that contain WB_wombat_
+ // must insert wombat init first!
+ if (starts_with(value, "javascript:")) {
+ if (value.indexOf("WB_wombat_") >= 0) {
+ var JS = "javascript:";
+ new_value = JS;
+ new_value += "window.parent._wb_wombat.init_new_window_wombat(window);" // NOTE(review): assumes init_new_window_wombat is exposed on _wb_wombat - confirm
+ new_value += value.substr(JS.length); // original script body follows the injected init call
+ }
+ }
+
+ if (!new_value) { // no javascript: special case applied: use the regular url rewriter
+ new_value = rewrite_url(value, false);
+ }
+
+ if (new_value != value) { // write back only when the value actually changed
+ wb_setAttribute.call(elem, name, new_value);
+ return true;
+ }
+
+ return false;
+ }
+
//============================================
function rewrite_elem(elem)
{
@@ -1169,6 +1249,7 @@ var wombat_internal = function($wbwindow) {
if (elem.tagName == "STYLE") {
var new_content = rewrite_style(elem.textContent);
if (elem.textContent != new_content) {
+ elem.textContent = new_content;
changed = true;
}
} else if (elem.tagName == "OBJECT") {
@@ -1177,10 +1258,13 @@ var wombat_internal = function($wbwindow) {
changed = rewrite_attr(elem, "action", true);
} else if (elem.tagName == "INPUT") {
changed = rewrite_attr(elem, "value", true);
+ } else if (elem.tagName == "IFRAME" || elem.tagName == "FRAME") {
+ changed = rewrite_frame_src(elem, "src");
} else {
changed = rewrite_attr(elem, "src");
changed = rewrite_attr(elem, "href") || changed;
changed = rewrite_attr(elem, "style") || changed;
+ changed = rewrite_attr(elem, "poster") || changed;
}
if (elem.getAttribute) {
@@ -1648,6 +1732,47 @@ var wombat_internal = function($wbwindow) {
}
}
+ //============================================
+ function init_hash_change() // register listeners that exchange location.hash changes with the top frame via postMessage
+ {
+ if (!$wbwindow.__WB_top_frame) { // no top frame set: nothing to exchange hash changes with
+ return;
+ }
+
+ function receive_hash_change(event) // "message" listener: applies an "outer_hashchange" hash to this window
+ {
+ if (!event.data || event.source != $wbwindow.__WB_top_frame) { // ignore messages that did not come from the top frame
+ return;
+ }
+
+ var message = event.data;
+
+ if (!message.wb_type) {
+ return;
+ }
+
+ if (message.wb_type == "outer_hashchange") {
+ if ($wbwindow.location.hash != message.hash) { // assign only when the hash differs
+ $wbwindow.location.hash = message.hash;
+ }
+ }
+ }
+
+ function send_hash_change() { // "hashchange" listener: reports this window's current hash to the top frame
+ var message = {"wb_type": "hashchange",
+ "hash": $wbwindow.location.hash
+ }
+
+ if ($wbwindow.__WB_top_frame) {
+ $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
+ }
+ }
+
+ $wbwindow.addEventListener("message", receive_hash_change);
+
+ $wbwindow.addEventListener("hashchange", send_hash_change);
+ }
+
//============================================
function init_postmessage_override($wbwindow)
{
@@ -1659,7 +1784,7 @@ var wombat_internal = function($wbwindow) {
$wbwindow.__orig_postMessage = orig;
- var postmessage_rewritten = function(message, targetOrigin, transfer) {
+ var postmessage_rewritten = function(message, targetOrigin, transfer, from_top) {
var from = undefined;
var src_id = undefined;
@@ -1699,7 +1824,9 @@ var wombat_internal = function($wbwindow) {
var new_message = {"from": from,
"to_host": to,
"src_id": src_id,
- "message": message};
+ "message": message,
+ "from_top": from_top,
+ }
if (targetOrigin != "*") {
targetOrigin = this.location.origin;
@@ -1737,7 +1864,9 @@ var wombat_internal = function($wbwindow) {
var source = event.source;
- if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) {
+ if (event.data.from_top) {
+ source = win.__WB_top_frame;
+ } else if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) {
source = win.__WB_win_id[event.data.src_id];
}
@@ -1804,7 +1933,9 @@ var wombat_internal = function($wbwindow) {
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
strUrl = rewrite_url(strUrl, false, "");
- return orig.call(this, strUrl, strWindowName, strWindowFeatures);
+ var res = orig.call(this, strUrl, strWindowName, strWindowFeatures);
+ init_new_window_wombat(res, strUrl);
+ return res;
}
$wbwindow.open = open_rewritten;
@@ -1845,6 +1976,24 @@ var wombat_internal = function($wbwindow) {
cookie = cookie.replace(wb_abs_prefix, '');
cookie = cookie.replace(wb_rel_prefix, '');
+ // rewrite domain
+ cookie = cookie.replace(cookie_domain_regex, function(m, m1) {
+ var message = {"domain": m1,
+ "cookie": cookie,
+ "wb_type": "cookie",
+ }
+
+ // norify of cookie setting to allow server-side tracking
+ $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
+
+ // if no subdomain, eg. "localhost", just remove domain altogether
+ if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) {
+ return "Domain=." + $wbwindow.location.hostname;
+ } else {
+ return "";
+ }
+ });
+
// rewrite path
cookie = cookie.replace(cookie_path_regex, function(m, m1) {
var rewritten = rewrite_url(m1);
@@ -1856,16 +2005,6 @@ var wombat_internal = function($wbwindow) {
return "Path=" + rewritten;
});
- // rewrite domain
- cookie = cookie.replace(cookie_domain_regex, function(m, m1) {
- // if no subdomain, eg. "localhost", just remove domain altogether
- if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) {
- return "Domain=." + $wbwindow.location.hostname;
- } else {
- return "";
- }
- });
-
// rewrite secure, if needed
if ($wbwindow.location.protocol != "https:") {
cookie = cookie.replace("secure", "");
@@ -1988,7 +2127,11 @@ var wombat_internal = function($wbwindow) {
//var src = iframe.src;
var src = wb_getAttribute.call(iframe, "src");
-
+
+ init_new_window_wombat(win, src);
+ }
+
+ function init_new_window_wombat(win, src) {
if (!src || src == "" || src == "about:blank" || src.indexOf("javascript:") >= 0) {
win._WBWombat = wombat_internal(win);
win._wb_wombat = new win._WBWombat(wb_info);
@@ -2100,6 +2243,20 @@ var wombat_internal = function($wbwindow) {
}
}
+ //============================================
+    function init_beacon_override()
+    {
+        // Wrap navigator.sendBeacon so the target url is passed through
+        // rewrite_url() before the native implementation is invoked.
+        var nav = $wbwindow.navigator;
+        var native_sendBeacon = nav.sendBeacon;
+
+        // nothing to override if sendBeacon is not available
+        if (!native_sendBeacon) {
+            return;
+        }
+
+        nav.sendBeacon = function(url, data) {
+            var rewritten = rewrite_url(url);
+            return native_sendBeacon.call(this, rewritten, data);
+        };
+    }
+
//============================================
function get_final_url(prefix, mod, url) {
if (mod == undefined) {
@@ -2126,6 +2283,8 @@ var wombat_internal = function($wbwindow) {
wb_opts = wbinfo.wombat_opts;
wb_replay_prefix = wbinfo.prefix;
+ wb_info.top_host = wb_info.top_host || "*";
+
init_top_frame($wbwindow);
init_wombat_top($wbwindow);
@@ -2174,6 +2333,10 @@ var wombat_internal = function($wbwindow) {
override_history_func("pushState");
override_history_func("replaceState");
+ override_history_nav("go");
+ override_history_nav("back");
+ override_history_nav("forward");
+
// open
init_open_override();
@@ -2183,6 +2346,8 @@ var wombat_internal = function($wbwindow) {
init_postmessage_override($wbwindow);
}
+ init_hash_change();
+
// write
init_write_override();
@@ -2242,14 +2407,17 @@ var wombat_internal = function($wbwindow) {
// Date
init_date_override(wbinfo.wombat_sec);
-
// registerProtocolHandler override
init_registerPH_override();
+ //sendBeacon override
+ init_beacon_override();
+
// expose functions
this.extract_orig = extract_orig;
this.rewrite_url = rewrite_url;
this.watch_elem = watch_elem;
+ this.init_new_window_wombat = init_new_window_wombat;
}
function init_top_frame($wbwindow) {
@@ -2290,11 +2458,14 @@ var wombat_internal = function($wbwindow) {
var real_parent = replay_top.__WB_orig_parent || replay_top.parent;
// Check to ensure top frame is different window and directly accessible (later refactor to support postMessage)
- try {
- if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) {
- real_parent = undefined;
- }
- } catch (e) {
+ //try {
+ // if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) {
+ // real_parent = undefined;
+ // }
+ //} catch (e) {
+ // real_parent = undefined;
+ //}
+ if (real_parent == $wbwindow || !wb_info.is_framed) {
real_parent = undefined;
}
diff --git a/pywb/templates/frame_insert.html b/pywb/templates/frame_insert.html
index af6a81af..e400d1dd 100644
--- a/pywb/templates/frame_insert.html
+++ b/pywb/templates/frame_insert.html
@@ -31,7 +31,7 @@ html, body
-
+
diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html
index f398cc01..9e838b61 100644
--- a/pywb/templates/head_insert.html
+++ b/pywb/templates/head_insert.html
@@ -4,7 +4,7 @@
wbinfo.url = "{{ cdx.url }}";
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}";
- wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
+ wbinfo.prefix = decodeURI("{{ wbrequest.wb_prefix }}");
wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
wbinfo.top_url = "{{ top_url }}";
wbinfo.is_framed = {{ "true" if wbrequest.options.is_framed else "false" }};
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 4c298334..3841134b 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -392,6 +392,7 @@ class HttpLoader(BaseLoader):
self.session = requests.Session()
r = self.session.get(url, headers=headers, stream=True)
+ r.raise_for_status()
return r.raw
diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
index a7188c38..124b7f75 100644
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@@ -7,6 +7,7 @@ from copy import copy
from six.moves import range
from six import iteritems
from pywb.utils.loaders import to_native_str
+import uuid
WRAP_WIDTH = 80
@@ -257,6 +258,12 @@ class StatusAndHeadersParser(object):
plen = len(prefix)
return (key_upper[:plen], key[plen:])
+    @staticmethod
+    def make_warc_id(id_=None):
+        """Return a WARC record id of the form ``<urn:uuid:ID>``.
+
+        :param id_: identifier to embed; when falsy, a new ``uuid.uuid1()``
+            value is generated and used instead.
+        """
+        if not id_:
+            id_ = uuid.uuid1()
+        # the id must actually be embedded in the template: an empty format
+        # string would yield '' for every record
+        return '<urn:uuid:{0}>'.format(id_)
+
#=================================================================
class StatusAndHeadersParserException(Exception):
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index 5d71a711..4b755726 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -138,6 +138,7 @@ import pytest
import six
from six import StringIO
from io import BytesIO
+import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
@@ -176,6 +177,14 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n'
+def test_limit_post():
+    # POST a body read through a 3-byte LimitReader over b'abcdefg' and
+    # expect the response text to contain "abc" (needs network access)
+    limited_body = LimitReader(BytesIO(b'abcdefg'), 3)
+    post_headers = {'Content-Length': '3'}
+
+    resp = requests.post('http://httpbin.org/post',
+                         data=limited_body,
+                         headers=post_headers)
+
+    assert '"abc"' in resp.text
# Error
def test_err_no_such_file():
diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py
index 08ea700d..3c5cd9f3 100644
--- a/pywb/utils/timeutils.py
+++ b/pywb/utils/timeutils.py
@@ -121,6 +121,18 @@ def iso_date_to_timestamp(string):
return datetime_to_timestamp(iso_date_to_datetime(string))
+def timestamp_to_iso_date(string):
+    """
+    Convert a timestamp string to an ISO date string by chaining
+    timestamp_to_datetime() and datetime_to_iso_date().
+
+    >>> timestamp_to_iso_date('20131226101112')
+    '2013-12-26T10:11:12Z'
+    """
+
+    as_datetime = timestamp_to_datetime(string)
+    return datetime_to_iso_date(as_datetime)
+
def http_date_to_timestamp(string):
"""
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index 4ff500d4..69883304 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -54,15 +54,18 @@ class ArchiveIterator(object):
def __init__(self, fileobj, no_record_parse=False,
- verify_http=False):
+ verify_http=False, arc2warc=False):
self.fh = fileobj
- self.loader = ArcWarcRecordLoader(verify_http=verify_http)
+ self.loader = ArcWarcRecordLoader(verify_http=verify_http,
+ arc2warc=arc2warc)
self.reader = None
self.offset = 0
self.known_format = None
+ self.mixed_arc_warc = arc2warc
+
self.member_info = None
self.no_record_parse = no_record_parse
@@ -226,7 +229,8 @@ class ArchiveIterator(object):
self.member_info = None
# Track known format for faster parsing of other records
- self.known_format = record.format
+ if not self.mixed_arc_warc:
+ self.known_format = record.format
return record
@@ -359,6 +363,9 @@ class DefaultRecordParser(object):
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
continue
+ if record.rec_type == 'arc_header':
+ continue
+
if record.format == 'warc':
if (record.rec_type in ('request', 'warcinfo') and
not include_all and
@@ -495,9 +502,6 @@ class DefaultRecordParser(object):
def parse_arc_record(self, record):
""" Parse arc record
"""
- if record.rec_type == 'arc_header':
- return None
-
url = record.rec_headers.get_header('uri')
url = url.replace('\r', '%0D')
url = url.replace('\n', '%0A')
@@ -528,7 +532,8 @@ class DefaultRecordParser(object):
def __call__(self, fh):
aiter = ArchiveIterator(fh, self.options.get('minimal', False),
- self.options.get('verify_http', False))
+ self.options.get('verify_http', False),
+ self.options.get('arc2warc', False))
entry_iter = self.create_record_iter(aiter)
diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py
index 402d1524..43931958 100644
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
+from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
import six
@@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader(object):
- # Standard ARC v1.0 headers
- # TODO: support ARC v2.0 also?
- ARC_HEADERS = ["uri", "ip-address", "archive-date",
- "content-type", "length"]
-
WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
@@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object):
HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
- verify_http=True):
+ verify_http=True, arc2warc=True):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
- self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
+ if arc2warc:
+ self.arc_parser = ARC2WARCHeadersParser()
+ else:
+ self.arc_parser = ARCHeadersParser()
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
@@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object):
else:
rec_type = 'response'
- elif the_format == 'warc':
+ elif the_format in ('warc', 'arc2warc'):
rec_type = rec_headers.get_header('WARC-Type')
uri = rec_headers.get_header('WARC-Target-URI')
length = rec_headers.get_header('Content-Length')
content_type = rec_headers.get_header('Content-Type')
- sub_len = 0
+ if the_format == 'warc':
+ sub_len = 0
+ else:
+ sub_len = rec_headers.total_len
+ the_format = 'warc'
is_err = False
@@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object):
# now try as arc
try:
rec_headers = self.arc_parser.parse(stream, statusline)
- return 'arc', rec_headers
+ return self.arc_parser.get_rec_type(), rec_headers
except StatusAndHeadersParserException as se:
if known_format == 'arc':
msg = 'Invalid ARC record, first line: '
@@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object):
#=================================================================
class ARCHeadersParser(object):
- def __init__(self, headernames):
- self.headernames = headernames
+ # ARC 1.0 headers
+ ARC_HEADERS = ["uri", "ip-address", "archive-date",
+ "content-type", "length"]
+
+ def __init__(self):
+ # header names come from the get_header_names() classmethod, so a
+ # subclass can supply a different list by overriding it
+ self.headernames = self.get_header_names()
+
+ def get_rec_type(self):
+ """Return the format label for records parsed by this parser: 'arc'."""
+ return 'arc'
def parse(self, stream, headerline=None):
total_read = 0
@@ -250,12 +260,60 @@ class ARCHeadersParser(object):
msg = msg.format(headernames, parts)
raise StatusAndHeadersParserException(msg, parts)
- headers = []
- for name, value in zip(headernames, parts):
- headers.append((name, value))
+ protocol, headers = self._get_protocol_and_headers(headerline, parts)
return StatusAndHeaders(statusline='',
headers=headers,
- protocol='ARC/1.0',
+ protocol='WARC/1.0',
total_len=total_read)
+
+ @classmethod
+ def get_header_names(cls):
+ """Return the header names assigned to parsed fields (ARC_HEADERS)."""
+ return cls.ARC_HEADERS
+
+    def _get_protocol_and_headers(self, headerline, parts):
+        """Pair each configured header name with its parsed value.
+
+        Returns an ``('ARC/1.0', headers)`` tuple, where ``headers`` is a
+        list of ``(name, value)`` tuples. ``headerline`` is not used here.
+        """
+        headers = [(name, value)
+                   for name, value in zip(self.headernames, parts)]
+
+        return ('ARC/1.0', headers)
+
+
+#=================================================================
+class ARC2WARCHeadersParser(ARCHeadersParser):
+    """ARC header parser that labels the parsed fields with WARC header
+    names and reports its records as 'arc2warc'.
+    """
+
+    # WARC header names assigned, in order, to the parsed ARC fields
+    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
+                           "WARC-IP-Address",
+                           "WARC-Date",
+                           "Content-Type",
+                           "Content-Length"]
+
+    def get_rec_type(self):
+        return 'arc2warc'
+
+    @classmethod
+    def get_header_names(cls):
+        return cls.ARC_TO_WARC_HEADERS
+
+    def _get_protocol_and_headers(self, headerline, parts):
+        """Build WARC-style headers from the parsed ARC fields.
+
+        Returns a ``('WARC/1.0', headers)`` tuple; the WARC-Date value is
+        passed through timestamp_to_iso_date(), and WARC-Type plus
+        WARC-Record-ID entries are appended after the parsed fields.
+        """
+        headers = [
+            (name,
+             timestamp_to_iso_date(value) if name == 'WARC-Date' else value)
+            for name, value in zip(self.headernames, parts)
+        ]
+
+        # a 'filedesc://' header line is typed 'arc_header', not a response
+        is_file_header = headerline.startswith('filedesc://')
+        rec_type = 'arc_header' if is_file_header else 'response'
+
+        headers.append(('WARC-Type', rec_type))
+        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
+
+        return ('WARC/1.0', headers)
+
+
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index c38d3a08..daabf7bb 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
+# arc.gz
+>>> print_cdx_index('example.arc.gz', arc2warc=True)
+ CDX N b a m s k r M S V g
+com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
+
+# arc
+>>> print_cdx_index('example.arc', arc2warc=True)
+ CDX N b a m s k r M S V g
+com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
+
+
+
+
# wget warc, includes metadata by default
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
@@ -328,6 +341,22 @@ def test_cdxj_arc_minimal():
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
+def test_cdxj_arc_conv():
+    # arc.gz -- cdxj output with the arc2warc option enabled
+    expected = b"""
+com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+"""
+    actual = cdx_index('example.arc.gz', cdxj=True, arc2warc=True)
+
+    assert parse_cdxj(actual) == parse_cdxj(expected)
+
+def test_cdxj_arc_minimal_conv():
+    # arc.gz -- minimal cdxj output with the arc2warc option enabled
+    expected = b"""
+com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+"""
+    actual = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True)
+
+    assert parse_cdxj(actual) == parse_cdxj(expected)
+
+
+
def test_cdxj_empty():
options = dict(cdxj=True)
diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py
index 1835647a..1ebd0018 100644
--- a/pywb/webapp/cdx_api_handler.py
+++ b/pywb/webapp/cdx_api_handler.py
@@ -1,11 +1,13 @@
from pywb.cdx.cdxserver import create_cdx_server
+from pywb.utils.wbexception import NotFoundException
from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.webapp.query_handler import QueryHandler
from six.moves.urllib.parse import parse_qs
+import json
import six
@@ -21,7 +23,18 @@ class CDXAPIHandler(BaseHandler):
def __call__(self, wbrequest):
params = self.extract_params_from_wsgi_env(wbrequest.env)
- cdx_iter = self.index_handler.load_cdx(wbrequest, params)
+ try:
+ cdx_iter = self.index_handler.load_cdx(wbrequest, params)
+ except NotFoundException:
+ msg = 'No Captures found for: ' + params.get('url')
+ if params.get('output') == 'json':
+ msg = json.dumps(dict(error=msg))
+ content_type='application/json'
+ else:
+ content_type='text/plain'
+
+ return WbResponse.text_response(msg, content_type=content_type,
+ status='404 Not Found')
return WbResponse.text_stream(cdx_iter,
content_type='text/plain')
diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html
index f2ed6842..fc2d66d0 100644
--- a/sample_archive/text_content/sample.html
+++ b/sample_archive/text_content/sample.html
@@ -10,5 +10,5 @@ if (some_val) {
}
Test Content
-Some Link
+Some Link
diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html
index ed4bc4f3..4242336f 100644
--- a/sample_archive/text_content/sample_no_head.html
+++ b/sample_archive/text_content/sample_no_head.html
@@ -5,4 +5,4 @@ if (some_val) {
}
Test Content
-Some Link
+Some Link
diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py
index 6c48c5de..575c51a8 100644
--- a/tests/test_live_proxy.py
+++ b/tests/test_live_proxy.py
@@ -125,7 +125,7 @@ class TestProxyLiveRewriter:
def test_echo_proxy_start_unbounded_remove_range(self):
headers = [('Range', 'bytes=0-')]
- resp = self.testapp.get('/rewrite/http://example.com/', headers=headers)
+ resp = self.testapp.get('/rewrite/http://httpbin.org/range/100', headers=headers)
# actual response is with range
assert resp.status_int == 206
@@ -138,7 +138,7 @@ class TestProxyLiveRewriter:
assert self.requestlog[0] == resp.text
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
- assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
+ assert self.requestlog[0].startswith('GET http://httpbin.org/range/100 HTTP/1.1')
assert 'range: ' not in self.requestlog[0]
assert len(self.cache) == 0
diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py
index fdc94415..0f4129f4 100644
--- a/tests/test_live_rewriter.py
+++ b/tests/test_live_rewriter.py
@@ -4,7 +4,6 @@ from pywb.framework.wsgi_wrappers import init_app
import webtest
import pywb.rewrite.rewrite_live
-
#=================================================================
class MockYTDWrapper(object):
def extract_info(self, url):
@@ -47,6 +46,7 @@ class TestLiveRewriter:
def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200
+ resp.charset = 'utf-8'
assert '