diff --git a/README.rst b/README.rst index 010a6f3e..60f5237d 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.31.0 +PyWb 0.32.0 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index 9f66d658..7cdc246c 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.31.0' +__version__ = '0.32.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 610df546..564d5b91 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -17,6 +17,12 @@ class RewrittenStatusAndHeaders(object): def contains_removed_header(self, name, value): return self.removed_header_dict.get(name) == value + def readd_rewrite_removed(self): + for name in HeaderRewriter.PROXY_NO_REWRITE_HEADERS: + value = self.removed_header_dict.get(name) + if value is not None: + self.status_headers.headers.append((name, value)) + #================================================================= class HeaderRewriter(object): @@ -34,19 +40,21 @@ class HeaderRewriter(object): 'json': ['application/json'], 'xml': ['/xml', '+xml', '.xml', '.rss'], + + 'plain': ['text/plain'], } PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', - 'accept-ranges'] + 'accept-ranges', 'www-authenticate', 'proxy-authenticate'] URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base'] - ENCODING_HEADERS = ['content-encoding'] + #ENCODING_HEADERS = ['content-encoding'] REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy', 'strict-transport-security'] - PROXY_NO_REWRITE_HEADERS = ['content-length'] + PROXY_NO_REWRITE_HEADERS = ['content-length', 'content-encoding'] COOKIE_HEADERS = ['set-cookie', 'cookie'] @@ -141,9 +149,10 @@ class HeaderRewriter(object): elif urlrewriter and lowername in self.URL_REWRITE_HEADERS: new_headers.append((name, urlrewriter.rewrite(value))) - elif 
lowername in self.ENCODING_HEADERS: + elif lowername in self.PROXY_NO_REWRITE_HEADERS: if content_rewritten: removed_header_dict[lowername] = value + add_prefixed_header(name, value) else: add_header(name, value) @@ -151,10 +160,6 @@ class HeaderRewriter(object): removed_header_dict[lowername] = value add_prefixed_header(name, value) - elif (lowername in self.PROXY_NO_REWRITE_HEADERS and - not content_rewritten): - add_header(name, value) - elif (lowername in self.COOKIE_HEADERS and cookie_rewriter): cookie_list = cookie_rewriter.rewrite(value) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 90148c1f..21989a29 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -40,6 +40,7 @@ class HTMLRewriterMixin(object): 'embed': {'src': 'oe_'}, 'head': {'': defmod}, # for head rewriting 'iframe': {'src': 'if_'}, + 'image': {'src': 'im_', 'xlink:href': 'im_'}, 'img': {'src': 'im_', 'srcset': 'im_'}, 'ins': {'cite': defmod}, @@ -118,6 +119,8 @@ class HTMLRewriterMixin(object): META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) + ADD_WINDOW = re.compile('(?>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll') -[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] +[('Set-Cookie', 'abc=def; Path=file.html')] # keep Max-Age ->>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll') +>>> rewrite_cookie('abc=def; Path=/file.html; Max-Age=1500', urlrewriter2, 'coll') [('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')] # Cookie with invalid chars, not parsed @@ -92,14 +92,14 @@ def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'): @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_with_expires(): # keep expires - res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 
2021 22:23:01 GMT', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html' @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_with_expires_utc_replace(): # keep expires, UTC->GMT - res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html' @@ -113,14 +113,14 @@ def test_http_secure_flag(): @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_secure_flag_remove(): # Secure Remove - res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html' @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_secure_flag_keep(): # Secure Keep - res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter3, 'coll') assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html; secure' diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 8e1f1a87..e58c6d84 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -6,7 +6,7 @@ HTTP Headers Rewriting # Text with charset >>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 
GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) {'charset': 'utf-8', - 'removed_header_dict': {}, + 'removed_header_dict': {'content-length': '5'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('X-Archive-Orig-Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]), @@ -24,9 +24,11 @@ HTTP Headers Rewriting >>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) {'charset': None, 'removed_header_dict': {'content-encoding': 'gzip', + 'content-length': '199999', 'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), ('Content-Type', 'text/javascript'), + ('X-Archive-Orig-Content-Encoding', 'gzip'), ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': 'js'} @@ -76,7 +78,7 @@ def _test_head_data(headers, status='200 OK', rewriter=urlrewriter): def test_cookie_headers(): # cookie, host/origin rewriting res = _test_head_data([('Connection', 'close'), - ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'), + ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')]) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index afb1da93..d8087555 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -8,7 +8,7 @@ r""" #================================================================= >>> parse('Text') -Text +Text >>> parse('
')
@@ -35,8 +35,8 @@ r""" >>> parse('', urlrewriter=full_path_urlrewriter) ->>> parse('') - +>>> parse('') + # ensure trailing slash added >>> parse('') @@ -47,7 +47,7 @@ r""" >>> parse('', urlrewriter=no_base_canon_rewriter) - + # Empty url >>> parse('') @@ -56,6 +56,9 @@ r""" >>> parse('') +# href on other tags +>>> parse('
Text
') +
Text
# HTML Entities >>> parse('›   > ?') @@ -145,25 +148,40 @@ r""" >>> parse('
') -
+
+ +>>> parse('
') +
>>> parse('') +>>> parse('') + + +>>> parse('') + + +>>> parse("") + + +#>>> parse('') + + # Style ->>> parse('') - +>>> parse('') + # Unterminated style tag, handle and auto-terminate >>> parse(' + # Head Insertion ->>> parse('Test', head_insert = '') -Test +>>> parse('Test', head_insert = '') +Test >>> parse('', head_insert = '') - + >>> parse('Test', head_insert = '') Test @@ -171,7 +189,7 @@ r""" >>> parse('
SomeTest
', head_insert = '/* Insert */') /* Insert */
SomeTest
->>> parse('
SomeTest
', head_insert = '') +>>> parse('
SomeTest
', head_insert = '')
SomeTest
>>> parse('Some Text without any tags ', head_insert = '') @@ -218,7 +236,7 @@ r""" # remove extra spaces >>> parse('
Text') -Text +Text >>> parse('Text') Text diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 30480660..2762b9d0 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -144,8 +144,14 @@ r""" >>> _test_css("background: url(\"http://domain.com/path.html\")") 'background: url("/web/20131010/http://domain.com/path.html")' +>>> _test_css('background: url(" http://domain.com/path.html ")') +'background: url(" /web/20131010/http://domain.com/path.html ")' + +>>> _test_css('background: url(" http://domain.com/path.html x ")') +'background: url(" /web/20131010/http://domain.com/path.html x ")' + >>> _test_css("background: url(file.jpeg)") -'background: url(/web/20131010/http://example.com/file.jpeg)' +'background: url(file.jpeg)' >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") "background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')" @@ -157,18 +163,18 @@ r""" "background: url('')" >>> _test_css("background: url (\"weirdpath\')") -'background: url ("/web/20131010/http://example.com/weirdpath\')' +'background: url ("weirdpath\')' ->>> _test_css("@import url ('path.css')") +>>> _test_css("@import url ('/path.css')") "@import url ('/web/20131010/http://example.com/path.css')" >>> _test_css("@import url('path.css')") -"@import url('/web/20131010/http://example.com/path.css')" +"@import url('path.css')" >>> _test_css("@import ( 'path.css')") -"@import ( '/web/20131010/http://example.com/path.css')" +"@import ( 'path.css')" ->>> _test_css("@import \"path.css\"") +>>> _test_css("@import \"/path.css\"") '@import "/web/20131010/http://example.com/path.css"' >>> _test_css("@import ('../path.css\"") @@ -178,7 +184,7 @@ r""" '@import (\'/web/20131010/http://example.com/url.css"' >>> _test_css("@import (\"url.css\")") -'@import ("/web/20131010/http://example.com/url.css")' +'@import ("url.css")' >>> _test_css("@import 
url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") '@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)' diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index d3ffc3d8..62fa3bf9 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -123,7 +123,7 @@ def test_local_no_head_banner_only(): assert 'window.location = "/other.html"' in buff # link NOT rewritten - assert '"another.html"' in buff + assert '"/some/path/another.html"' in buff def test_local_banner_only_no_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', @@ -138,7 +138,7 @@ def test_local_banner_only_no_rewrite(): assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff # link NOT rewritten - assert '"another.html"' in buff + assert '"/some/path/another.html"' in buff def test_local_2_link_only_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index ac23051a..da243cd7 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -21,19 +21,19 @@ # UrlRewriter tests >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') -'/web/20131010/http://example.com/path/other.html' +'other.html' ->>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') +>>> do_rewrite('/path/file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') '/web/20131010js_/http://example.com/path/file.js' ->>> do_rewrite('file.js', '20131010/http://example.com/', '/coll/') +>>> do_rewrite('/file.js', 
'20131010/http://example.com/', '/coll/') '/coll/20131010/http://example.com/file.js' ->>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', 'js_') +>>> do_rewrite('/file.js', '20131010/http://example.com', '/coll/', 'js_') '/coll/20131010js_/http://example.com/file.js' >>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', '') -'/coll/20131010/http://example.com/file.js' +'file.js' >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', 'http://localhost:8080/coll/') '/coll/20130907*/http://example.com/other.html' @@ -41,8 +41,8 @@ >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/other.html' ->>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/') -'/coll/20130907*/http://example.com/path/other.html' +>>> do_rewrite('other.html', '20130907*/http://example.com/path/page.html', '/coll/') +'other.html' >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/') '/coll/20131112im_/http://example.com/other.html' @@ -87,7 +87,7 @@ '2020/http://example.com/other.html' >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/') -'/web/20131010010203/http://example.com/file.html' +'' >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') '#anchor' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 4774bc03..d6fda47f 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -19,6 +19,9 @@ class UrlRewriter(object): REL_SCHEME = ('//', r'\/\/', r'\\/\\/') + PARENT_PATH = '../' + REL_PATH = '/' + def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None, root_path=None, cookie_scope=None, rewrite_opts=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) @@ -60,6 +63,11 @@ class UrlRewriter(object): if url.startswith(self.REL_SCHEME): is_abs = True 
scheme_rel = True + elif (not is_abs and + not url.startswith(self.REL_PATH) and + self.PARENT_PATH not in url): + return url + # if prefix starts with a scheme #if self.prefix_scheme: # url = self.prefix_scheme + ':' + url diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 3b4e08cf..73eab273 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -135,6 +135,15 @@ rules: fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)' + # periscope + #================================================================= + + - url_prefix: 'tv,periscope,assets)/js/' + + rewrite: + js_regexs: + - match: '"location"' + replace: '"WB_wombat_location"' # vimeo rules #================================================================= diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js index 3deb2343..f4368ece 100644 --- a/pywb/static/vidrw.js +++ b/pywb/static/vidrw.js @@ -165,7 +165,7 @@ __wbvidrw = (function() { var name = child.getAttribute("name"); name = name.toLowerCase(); - if (name == "movie") { + if (name == "movie" || name == "src") { var value = child.getAttribute("value"); obj_url = value; } diff --git a/pywb/static/wb.js b/pywb/static/wb.js index f57e833b..6b5693c1 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -110,7 +110,7 @@ function remove_event(name, func, object) { } } -function notify_top() { +function notify_top(event) { if (!window.__WB_top_frame) { return; } @@ -123,25 +123,18 @@ function notify_top() { return; } - //if (window.__WB_top_frame.update_wb_url) { - // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, - // wbinfo.timestamp, - // wbinfo.request_ts, - // wbinfo.is_live); - //} - var message = { "url": window.WB_wombat_location.href, "ts": wbinfo.timestamp, "request_ts": wbinfo.request_ts, "is_live": wbinfo.is_live, - "title": "", + "title": document ? 
document.title : "", "wb_type": "load", } window.__WB_top_frame.postMessage(message, "*"); - remove_event("readystatechange", notify_top, document); + //remove_event("readystatechange", notify_top, document); } this.load = function() { @@ -152,7 +145,7 @@ this.load = function() { window._wb_js_inited = true; // Non-Framed Replay OR top frame for framed replay! - if (window.wbinfo && (!window.__WB_top_frame || window.__WB_top_frame == window)) { + if (window.wbinfo && !window.__WB_top_frame) { if (wbinfo.is_framed && wbinfo.mod != "bn_") { var hash = window.location.hash; @@ -171,7 +164,7 @@ this.load = function() { add_event("readystatechange", init_banner, document); // Framed Replay - } else if (window.__WB_top_frame && window != window.__WB_top_frame && window.__WB_top_frame.update_wb_url) { + } else if (window.__WB_top_frame) { add_event("readystatechange", notify_top, document); } } diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 168b914f..2f32f43b 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -19,26 +19,31 @@ This file is part of pywb, https://github.com/ikreymer/pywb var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/; -var TS_REGEX = /\/([\d]{1,14})\//; +var TS_REGEX = /\/([\d]{1,14})(?:\w+_)?\/(?:\w+[:])?\/\//; -var curr_state = {}; +//var curr_state = {}; var IFRAME_ID = "replay_iframe"; -function make_url(url, ts, mod) +var last_inner_hash = undefined; + +function make_url(url, ts, mod, prefix) { if (ts || mod) { mod += "/"; } + prefix = prefix || wbinfo.prefix; + if (ts) { - return wbinfo.prefix + ts + mod + url; + return prefix + ts + mod + url; } else { - return wbinfo.prefix + mod + url; + return prefix + mod + url; } } function push_state(state) { + /* var frame = document.getElementById(IFRAME_ID).contentWindow; if (frame.WB_wombat_location) { var curr_href = frame.WB_wombat_location.href; @@ -48,13 +53,19 @@ function push_state(state) { return; } } + */ - state.outer_url = make_url(state.url, 
state.request_ts, wbinfo.frame_mod); + state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod, wbinfo.outer_prefix); state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod); - var canon_url = make_url(state.url, state.request_ts, ""); + var canon_url = make_url(state.url, state.request_ts, "", wbinfo.outer_prefix); + if (window.location.href != canon_url) { - window.history.replaceState(state, "", canon_url); + if (state.wb_type != "pushState") { + window.history.replaceState(state, "", canon_url); + } else { + window.history.pushState(state, "", canon_url); + } } set_state(state); @@ -63,8 +74,8 @@ function push_state(state) { function pop_state(state) { set_state(state); - var frame = document.getElementById(IFRAME_ID).contentWindow; - frame.src = state.inner_url; + //var frame = document.getElementById(IFRAME_ID); + //frame.src = state.inner_url; } function extract_ts(url) @@ -103,7 +114,7 @@ function set_state(state) { } } - curr_state = state; + //curr_state = state; } window.onpopstate = function(event) { @@ -123,43 +134,6 @@ function extract_ts_cookie(value) { } } -function iframe_loaded(event) { - var url; - var ts; - var request_ts; - var capture_str; - var is_live = false; - var iframe = document.getElementById(IFRAME_ID).contentWindow; - - if (iframe.WB_wombat_location) { - url = iframe.WB_wombat_location.href; - } else { - url = extract_replay_url(iframe.location.href); - } - - if (iframe.wbinfo) { - ts = iframe.wbinfo.timestamp; - request_ts = iframe.wbinfo.request_ts; - is_live = iframe.wbinfo.is_live; - } else { - ts = extract_ts_cookie(iframe.document.cookie); - if (ts) { - is_live = true; - } else { - ts = extract_ts(iframe.location.href); - } - request_ts = ts; - } - - var state = {} - state["url"] = url; - state["ts"] = ts; - state["request_ts"] = request_ts; - state["is_live"] = is_live - - update_wb_url(state); -} - function init_pm() { var frame = document.getElementById(IFRAME_ID).contentWindow; @@ -172,7 
+146,8 @@ function init_pm() { // Check if iframe url change message if (typeof(event.data) == "object" && event.data["wb_type"]) { - update_wb_url(event.data); + handle_message(event.data); + } else { // Pass to parent window.parent.postMessage(event.data, "*"); @@ -187,55 +162,67 @@ function init_pm() { } -function update_wb_url(state) { - if (curr_state.url == state.url && curr_state.ts == state.ts) { - return; +function handle_message(state) { + var type = state.wb_type; + + if (type == "load" || type == "pushState" || type == "replaceState") { + update_wb_url(state); + } else if (type == "go") { + window.history.go(state.param); + } else if (type == "back") { + window.history.back(); + } else if (type == "forward") { + window.history.forward(); + } else if (type == "hashchange") { + inner_hash_changed(state); } +} + + +function update_wb_url(state) { + //if (curr_state && curr_state.url == state.url && curr_state.ts == state.ts) { + // return; + //} state['capture_str'] = _wb_js.ts_to_date(state.ts, true); push_state(state); } -// Load Banner -if (_wb_js) { - _wb_js.load(); +function inner_hash_changed(state) { + if (window.location.hash != state.hash) { + window.location.hash = state.hash; + } + last_inner_hash = state.hash; +} + +function outer_hash_changed(event) { + if (window.location.hash == last_inner_hash) { + return; + } + + var frame = document.getElementById(IFRAME_ID).contentWindow; + + var message = {"wb_type": "outer_hashchange", "hash": window.location.hash} + + frame.postMessage(message, "*", undefined, true); } function init_hash_connect() { - var frame = document.getElementById(IFRAME_ID).contentWindow; + var frame = document.getElementById(IFRAME_ID); if (window.location.hash) { var curr_url = wbinfo.capture_url + window.location.hash; - - frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); + + frame.src = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); + + last_inner_hash = window.location.hash; + 
//frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); //frame.location.hash = window.location.hash; } - - function outer_hash_changed() { - var the_frame = document.getElementById(IFRAME_ID).contentWindow; - - if (window.location.hash == the_frame.location.hash) { - return; - } - - the_frame.location.hash = window.location.hash; - //the_frame.location.href = make_url(curr_url, curr_state.request_ts, wbinfo.replay_mod); - } - - function inner_hash_changed() { - var the_frame = document.getElementById(IFRAME_ID).contentWindow; - - if (window.location.hash == the_frame.location.hash) { - return; - } - - window.location.hash = the_frame.location.hash; - } if ("onhashchange" in window) { window.addEventListener("hashchange", outer_hash_changed, false); - frame.addEventListener("hashchange", inner_hash_changed, false); } // Init Post Message connect @@ -244,3 +231,10 @@ function init_hash_connect() { document.addEventListener("DOMContentLoaded", init_hash_connect); +// Load Banner +if (_wb_js) { + _wb_js.load(); +} + + + diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index af280f94..2d75f577 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ //============================================ -// Wombat JS-Rewriting Library v2.12 +// Wombat JS-Rewriting Library v2.16 //============================================ @@ -314,6 +314,8 @@ var wombat_internal = function($wbwindow) { return ""; } + var orig_href = href; + // proxy mode: no extraction needed if (!wb_replay_prefix) { return href; @@ -348,7 +350,7 @@ var wombat_internal = function($wbwindow) { href = href.substr(4); } - if (!starts_with(href, VALID_PREFIXES)) { + if (href != orig_href && !starts_with(href, VALID_PREFIXES)) { href = HTTP_PREFIX + href; } } @@ -402,7 +404,17 @@ var wombat_internal = function($wbwindow) { function make_parser(href) { href = extract_orig(href); - var p = 
$wbwindow.document.createElement("a", true); + var baseWin; + + // special case: for newly opened blank windows, use the opener + // to create parser to have the proper baseURI + if ($wbwindow.location.href == "about:blank" && $wbwindow.opener) { + baseWin = $wbwindow.opener; + } else { + baseWin = $wbwindow; + } + + var p = baseWin.document.createElement("a", true); p.href = href; return p; } @@ -712,21 +724,21 @@ var wombat_internal = function($wbwindow) { function rewritten_func(state_obj, title, url) { url = rewrite_url(url); + var abs_url = extract_orig(url); + + if (abs_url && !starts_with(abs_url, $wbwindow.WB_wombat_location.origin + "/")) { + throw new DOMException("Invalid history change: " + abs_url); + } + if (url == $wbwindow.location.href) { return; } orig_func.call(this, state_obj, title, url); - //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { - // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, - // wb_info.timestamp, - // wb_info.request_ts, - // wb_info.is_live); - //} - if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) { + if ($wbwindow.__WB_top_frame) { var message = { - "url": url, + "url": abs_url, "ts": wb_info.timestamp, "request_ts": wb_info.request_ts, "is_live": wb_info.is_live, @@ -734,7 +746,7 @@ var wombat_internal = function($wbwindow) { "wb_type": func_name, } - $wbwindow.__WB_top_frame.postMessage(message, "*"); + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); } } @@ -746,6 +758,45 @@ var wombat_internal = function($wbwindow) { return rewritten_func; } + //============================================ + function override_history_nav(func_name) { + if (!$wbwindow.history) { + return; + } + + // Only useful for framed replay + if (!$wbwindow.__WB_top_frame) { + return; + } + + var orig_func = $wbwindow.history[func_name]; + + if (!orig_func) { + return; + } + + function rewritten_func() { + 
orig_func.apply(this, arguments); + + var message = { + "wb_type": func_name, + } + + if (func_name == "go") { + message["param"] = arguments[0]; + } + + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); + } + + $wbwindow.history[func_name] = rewritten_func; + if ($wbwindow.History && $wbwindow.History.prototype) { + $wbwindow.History.prototype[func_name] = rewritten_func; + } + + return rewritten_func; + } + //============================================ function init_ajax_rewrite() { if (!$wbwindow.XMLHttpRequest || @@ -1157,6 +1208,35 @@ var wombat_internal = function($wbwindow) { return value; } + //============================================ + function rewrite_frame_src(elem, name) + { + var value = wb_getAttribute.call(elem, name); + var new_value = undefined; + + // special case for rewriting javascript: urls that contain WB_wombat_ + // must insert wombat init first! + if (starts_with(value, "javascript:")) { + if (value.indexOf("WB_wombat_") >= 0) { + var JS = "javascript:"; + new_value = JS; + new_value += "window.parent._wb_wombat.init_new_window_wombat(window);" + new_value += value.substr(JS.length); + } + } + + if (!new_value) { + new_value = rewrite_url(value, false); + } + + if (new_value != value) { + wb_setAttribute.call(elem, name, new_value); + return true; + } + + return false; + } + //============================================ function rewrite_elem(elem) { @@ -1169,6 +1249,7 @@ var wombat_internal = function($wbwindow) { if (elem.tagName == "STYLE") { var new_content = rewrite_style(elem.textContent); if (elem.textContent != new_content) { + elem.textContent = new_content; changed = true; } } else if (elem.tagName == "OBJECT") { @@ -1177,10 +1258,13 @@ var wombat_internal = function($wbwindow) { changed = rewrite_attr(elem, "action", true); } else if (elem.tagName == "INPUT") { changed = rewrite_attr(elem, "value", true); + } else if (elem.tagName == "IFRAME" || elem.tagName == "FRAME") { + changed = rewrite_frame_src(elem, 
"src"); } else { changed = rewrite_attr(elem, "src"); changed = rewrite_attr(elem, "href") || changed; changed = rewrite_attr(elem, "style") || changed; + changed = rewrite_attr(elem, "poster") || changed; } if (elem.getAttribute) { @@ -1648,6 +1732,47 @@ var wombat_internal = function($wbwindow) { } } + //============================================ + function init_hash_change() + { + if (!$wbwindow.__WB_top_frame) { + return; + } + + function receive_hash_change(event) + { + if (!event.data || event.source != $wbwindow.__WB_top_frame) { + return; + } + + var message = event.data; + + if (!message.wb_type) { + return; + } + + if (message.wb_type == "outer_hashchange") { + if ($wbwindow.location.hash != message.hash) { + $wbwindow.location.hash = message.hash; + } + } + } + + function send_hash_change() { + var message = {"wb_type": "hashchange", + "hash": $wbwindow.location.hash + } + + if ($wbwindow.__WB_top_frame) { + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); + } + } + + $wbwindow.addEventListener("message", receive_hash_change); + + $wbwindow.addEventListener("hashchange", send_hash_change); + } + //============================================ function init_postmessage_override($wbwindow) { @@ -1659,7 +1784,7 @@ var wombat_internal = function($wbwindow) { $wbwindow.__orig_postMessage = orig; - var postmessage_rewritten = function(message, targetOrigin, transfer) { + var postmessage_rewritten = function(message, targetOrigin, transfer, from_top) { var from = undefined; var src_id = undefined; @@ -1699,7 +1824,9 @@ var wombat_internal = function($wbwindow) { var new_message = {"from": from, "to_host": to, "src_id": src_id, - "message": message}; + "message": message, + "from_top": from_top, + } if (targetOrigin != "*") { targetOrigin = this.location.origin; @@ -1737,7 +1864,9 @@ var wombat_internal = function($wbwindow) { var source = event.source; - if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) { + if 
(event.data.from_top) { + source = win.__WB_top_frame; + } else if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) { source = win.__WB_win_id[event.data.src_id]; } @@ -1804,7 +1933,9 @@ var wombat_internal = function($wbwindow) { var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { strUrl = rewrite_url(strUrl, false, ""); - return orig.call(this, strUrl, strWindowName, strWindowFeatures); + var res = orig.call(this, strUrl, strWindowName, strWindowFeatures); + init_new_window_wombat(res, strUrl); + return res; } $wbwindow.open = open_rewritten; @@ -1845,6 +1976,24 @@ var wombat_internal = function($wbwindow) { cookie = cookie.replace(wb_abs_prefix, ''); cookie = cookie.replace(wb_rel_prefix, ''); + // rewrite domain + cookie = cookie.replace(cookie_domain_regex, function(m, m1) { + var message = {"domain": m1, + "cookie": cookie, + "wb_type": "cookie", + } + + // norify of cookie setting to allow server-side tracking + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); + + // if no subdomain, eg. "localhost", just remove domain altogether + if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { + return "Domain=." + $wbwindow.location.hostname; + } else { + return ""; + } + }); + // rewrite path cookie = cookie.replace(cookie_path_regex, function(m, m1) { var rewritten = rewrite_url(m1); @@ -1856,16 +2005,6 @@ var wombat_internal = function($wbwindow) { return "Path=" + rewritten; }); - // rewrite domain - cookie = cookie.replace(cookie_domain_regex, function(m, m1) { - // if no subdomain, eg. "localhost", just remove domain altogether - if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { - return "Domain=." 
+ $wbwindow.location.hostname; - } else { - return ""; - } - }); - // rewrite secure, if needed if ($wbwindow.location.protocol != "https:") { cookie = cookie.replace("secure", ""); @@ -1988,7 +2127,11 @@ var wombat_internal = function($wbwindow) { //var src = iframe.src; var src = wb_getAttribute.call(iframe, "src"); - + + init_new_window_wombat(win, src); + } + + function init_new_window_wombat(win, src) { if (!src || src == "" || src == "about:blank" || src.indexOf("javascript:") >= 0) { win._WBWombat = wombat_internal(win); win._wb_wombat = new win._WBWombat(wb_info); @@ -2100,6 +2243,20 @@ var wombat_internal = function($wbwindow) { } } + //============================================ + function init_beacon_override() + { + if (!$wbwindow.navigator.sendBeacon) { + return; + } + + var orig_sendBeacon = $wbwindow.navigator.sendBeacon; + + $wbwindow.navigator.sendBeacon = function(url, data) { + return orig_sendBeacon.call(this, rewrite_url(url), data); + } + } + //============================================ function get_final_url(prefix, mod, url) { if (mod == undefined) { @@ -2126,6 +2283,8 @@ var wombat_internal = function($wbwindow) { wb_opts = wbinfo.wombat_opts; wb_replay_prefix = wbinfo.prefix; + wb_info.top_host = wb_info.top_host || "*"; + init_top_frame($wbwindow); init_wombat_top($wbwindow); @@ -2174,6 +2333,10 @@ var wombat_internal = function($wbwindow) { override_history_func("pushState"); override_history_func("replaceState"); + override_history_nav("go"); + override_history_nav("back"); + override_history_nav("forward"); + // open init_open_override(); @@ -2183,6 +2346,8 @@ var wombat_internal = function($wbwindow) { init_postmessage_override($wbwindow); } + init_hash_change(); + // write init_write_override(); @@ -2242,14 +2407,17 @@ var wombat_internal = function($wbwindow) { // Date init_date_override(wbinfo.wombat_sec); - // registerProtocolHandler override init_registerPH_override(); + //sendBeacon override + init_beacon_override(); + // 
expose functions this.extract_orig = extract_orig; this.rewrite_url = rewrite_url; this.watch_elem = watch_elem; + this.init_new_window_wombat = init_new_window_wombat; } function init_top_frame($wbwindow) { @@ -2290,11 +2458,14 @@ var wombat_internal = function($wbwindow) { var real_parent = replay_top.__WB_orig_parent || replay_top.parent; // Check to ensure top frame is different window and directly accessible (later refactor to support postMessage) - try { - if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) { - real_parent = undefined; - } - } catch (e) { + //try { + // if ((real_parent == $wbwindow) || !real_parent.wbinfo || !real_parent.wbinfo.is_frame) { + // real_parent = undefined; + // } + //} catch (e) { + // real_parent = undefined; + //} + if (real_parent == $wbwindow || !wb_info.is_framed) { real_parent = undefined; } diff --git a/pywb/templates/frame_insert.html b/pywb/templates/frame_insert.html index af6a81af..e400d1dd 100644 --- a/pywb/templates/frame_insert.html +++ b/pywb/templates/frame_insert.html @@ -31,7 +31,7 @@ html, body
- +
diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html index f398cc01..9e838b61 100644 --- a/pywb/templates/head_insert.html +++ b/pywb/templates/head_insert.html @@ -4,7 +4,7 @@ wbinfo.url = "{{ cdx.url }}"; wbinfo.timestamp = "{{ cdx.timestamp }}"; wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}"; - wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; + wbinfo.prefix = decodeURI("{{ wbrequest.wb_prefix }}"); wbinfo.mod = "{{ wbrequest.wb_url.mod }}"; wbinfo.top_url = "{{ top_url }}"; wbinfo.is_framed = {{ "true" if wbrequest.options.is_framed else "false" }}; diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 4c298334..3841134b 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -392,6 +392,7 @@ class HttpLoader(BaseLoader): self.session = requests.Session() r = self.session.get(url, headers=headers, stream=True) + r.raise_for_status() return r.raw diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index a7188c38..124b7f75 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -7,6 +7,7 @@ from copy import copy from six.moves import range from six import iteritems from pywb.utils.loaders import to_native_str +import uuid WRAP_WIDTH = 80 @@ -257,6 +258,12 @@ class StatusAndHeadersParser(object): plen = len(prefix) return (key_upper[:plen], key[plen:]) + @staticmethod + def make_warc_id(id_=None): + if not id_: + id_ = uuid.uuid1() + return '<urn:uuid:{0}>'.format(id_) + #================================================================= class StatusAndHeadersParserException(Exception): diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 5d71a711..4b755726 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -138,6 +138,7 @@ import pytest import six from six import StringIO from io import BytesIO +import requests from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders
import LimitReader, extract_client_cookie, extract_post_query @@ -176,6 +177,14 @@ def test_s3_read_1(): assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC-Type: response\r\n' +def test_limit_post(): + reader = LimitReader(BytesIO(b'abcdefg'), 3) + r = requests.request(method='POST', + url='http://httpbin.org/post', + data=reader, + headers={'Content-Length': '3'}) + + assert '"abc"' in r.text # Error def test_err_no_such_file(): diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 08ea700d..3c5cd9f3 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -121,6 +121,18 @@ def iso_date_to_timestamp(string): return datetime_to_timestamp(iso_date_to_datetime(string)) +def timestamp_to_iso_date(string): + """ + >>> timestamp_to_iso_date('20131226101112') + '2013-12-26T10:11:12Z' + + >>> timestamp_to_iso_date('20131226101112') + '2013-12-26T10:11:12Z' + """ + + + return datetime_to_iso_date(timestamp_to_datetime(string)) + def http_date_to_timestamp(string): """ diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 4ff500d4..69883304 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -54,15 +54,18 @@ class ArchiveIterator(object): def __init__(self, fileobj, no_record_parse=False, - verify_http=False): + verify_http=False, arc2warc=False): self.fh = fileobj - self.loader = ArcWarcRecordLoader(verify_http=verify_http) + self.loader = ArcWarcRecordLoader(verify_http=verify_http, + arc2warc=arc2warc) self.reader = None self.offset = 0 self.known_format = None + self.mixed_arc_warc = arc2warc + self.member_info = None self.no_record_parse = no_record_parse @@ -226,7 +229,8 @@ class ArchiveIterator(object): self.member_info = None # Track known format for faster parsing of other records - self.known_format = record.format + if not self.mixed_arc_warc: + self.known_format = record.format return record @@ -359,6 +363,9 @@ class DefaultRecordParser(object): if not 
include_all and not minimal and (record.status_headers.get_statuscode() == '-'): continue + if record.rec_type == 'arc_header': + continue + if record.format == 'warc': if (record.rec_type in ('request', 'warcinfo') and not include_all and @@ -495,9 +502,6 @@ class DefaultRecordParser(object): def parse_arc_record(self, record): """ Parse arc record """ - if record.rec_type == 'arc_header': - return None - url = record.rec_headers.get_header('uri') url = url.replace('\r', '%0D') url = url.replace('\n', '%0A') @@ -528,7 +532,8 @@ class DefaultRecordParser(object): def __call__(self, fh): aiter = ArchiveIterator(fh, self.options.get('minimal', False), - self.options.get('verify_http', False)) + self.options.get('verify_http', False), + self.options.get('arc2warc', False)) entry_iter = self.create_record_iter(aiter) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 402d1524..43931958 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -9,6 +9,7 @@ from pywb.utils.loaders import to_native_str from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException +from pywb.utils.timeutils import timestamp_to_iso_date from six.moves import zip import six @@ -37,11 +38,6 @@ class ArchiveLoadFailed(WbException): #================================================================= class ArcWarcRecordLoader(object): - # Standard ARC v1.0 headers - # TODO: support ARC v2.0 also? 
- ARC_HEADERS = ["uri", "ip-address", "archive-date", - "content-type", "length"] - WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18'] HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1'] @@ -55,14 +51,17 @@ class ArcWarcRecordLoader(object): HTTP_SCHEMES = ('http:', 'https:') def __init__(self, loader=None, cookie_maker=None, block_size=8192, - verify_http=True): + verify_http=True, arc2warc=True): if not loader: loader = BlockLoader(cookie_maker=cookie_maker) self.loader = loader self.block_size = block_size - self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) + if arc2warc: + self.arc_parser = ARC2WARCHeadersParser() + else: + self.arc_parser = ARCHeadersParser() self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES) self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) @@ -114,12 +113,16 @@ class ArcWarcRecordLoader(object): else: rec_type = 'response' - elif the_format == 'warc': + elif the_format in ('warc', 'arc2warc'): rec_type = rec_headers.get_header('WARC-Type') uri = rec_headers.get_header('WARC-Target-URI') length = rec_headers.get_header('Content-Length') content_type = rec_headers.get_header('Content-Type') - sub_len = 0 + if the_format == 'warc': + sub_len = 0 + else: + sub_len = rec_headers.total_len + the_format = 'warc' is_err = False @@ -201,7 +204,7 @@ class ArcWarcRecordLoader(object): # now try as arc try: rec_headers = self.arc_parser.parse(stream, statusline) - return 'arc', rec_headers + return self.arc_parser.get_rec_type(), rec_headers except StatusAndHeadersParserException as se: if known_format == 'arc': msg = 'Invalid ARC record, first line: ' @@ -212,8 +215,15 @@ class ArcWarcRecordLoader(object): #================================================================= class ARCHeadersParser(object): - def __init__(self, headernames): - self.headernames = headernames + # ARC 1.0 headers + ARC_HEADERS = ["uri", "ip-address", "archive-date", + "content-type", "length"] + + def __init__(self): + self.headernames = 
self.get_header_names() + + def get_rec_type(self): + return 'arc' def parse(self, stream, headerline=None): total_read = 0 @@ -250,12 +260,60 @@ class ARCHeadersParser(object): msg = msg.format(headernames, parts) raise StatusAndHeadersParserException(msg, parts) - headers = [] - for name, value in zip(headernames, parts): - headers.append((name, value)) + protocol, headers = self._get_protocol_and_headers(headerline, parts) return StatusAndHeaders(statusline='', headers=headers, - protocol='ARC/1.0', + protocol='WARC/1.0', total_len=total_read) + + @classmethod + def get_header_names(cls): + return cls.ARC_HEADERS + + def _get_protocol_and_headers(self, headerline, parts): + headers = [] + + for name, value in zip(self.headernames, parts): + headers.append((name, value)) + + return ('ARC/1.0', headers) + + +#================================================================= +class ARC2WARCHeadersParser(ARCHeadersParser): + # Headers for converting ARC -> WARC Header + ARC_TO_WARC_HEADERS = ["WARC-Target-URI", + "WARC-IP-Address", + "WARC-Date", + "Content-Type", + "Content-Length"] + + def get_rec_type(self): + return 'arc2warc' + + @classmethod + def get_header_names(cls): + return cls.ARC_TO_WARC_HEADERS + + def _get_protocol_and_headers(self, headerline, parts): + headers = [] + + for name, value in zip(self.headernames, parts): + if name == 'WARC-Date': + value = timestamp_to_iso_date(value) + + headers.append((name, value)) + + if headerline.startswith('filedesc://'): + rec_type = 'arc_header' + else: + rec_type = 'response' + + headers.append(('WARC-Type', rec_type)) + headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id())) + + return ('WARC/1.0', headers) + + diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index c38d3a08..daabf7bb 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -44,6 +44,19 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 
B2LTWWPUOYAH7UIPQ CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc +# arc.gz +>>> print_cdx_index('example.arc.gz', arc2warc=True) + CDX N b a m s k r M S V g +com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz + +# arc +>>> print_cdx_index('example.arc', arc2warc=True) + CDX N b a m s k r M S V g +com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc + + + + # wget warc, includes metadata by default >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g @@ -328,6 +341,22 @@ def test_cdxj_arc_minimal(): com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} """) +def test_cdxj_arc_conv(): + # arc.gz -- json + res = cdx_index('example.arc.gz', cdxj=True, arc2warc=True) + assert parse_cdxj(res) == parse_cdxj(b""" +com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + +def test_cdxj_arc_minimal_conv(): + # arc.gz -- minimal + json + res = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True) + assert parse_cdxj(res) == parse_cdxj(b""" +com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + + + def test_cdxj_empty(): options = dict(cdxj=True) diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py index 1835647a..1ebd0018 100644 --- a/pywb/webapp/cdx_api_handler.py +++ b/pywb/webapp/cdx_api_handler.py @@ -1,11 +1,13 @@ from pywb.cdx.cdxserver import create_cdx_server +from 
pywb.utils.wbexception import NotFoundException from pywb.framework.basehandlers import BaseHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.webapp.query_handler import QueryHandler from six.moves.urllib.parse import parse_qs +import json import six @@ -21,7 +23,18 @@ class CDXAPIHandler(BaseHandler): def __call__(self, wbrequest): params = self.extract_params_from_wsgi_env(wbrequest.env) - cdx_iter = self.index_handler.load_cdx(wbrequest, params) + try: + cdx_iter = self.index_handler.load_cdx(wbrequest, params) + except NotFoundException: + msg = 'No Captures found for: ' + params.get('url') + if params.get('output') == 'json': + msg = json.dumps(dict(error=msg)) + content_type='application/json' + else: + content_type='text/plain' + + return WbResponse.text_response(msg, content_type=content_type, + status='404 Not Found') return WbResponse.text_stream(cdx_iter, content_type='text/plain') diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html index f2ed6842..fc2d66d0 100644 --- a/sample_archive/text_content/sample.html +++ b/sample_archive/text_content/sample.html @@ -10,5 +10,5 @@ if (some_val) { } Test Content -Some Link +Some Link diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html index ed4bc4f3..4242336f 100644 --- a/sample_archive/text_content/sample_no_head.html +++ b/sample_archive/text_content/sample_no_head.html @@ -5,4 +5,4 @@ if (some_val) { } Test Content -Some Link +Some Link diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index 6c48c5de..575c51a8 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -125,7 +125,7 @@ class TestProxyLiveRewriter: def test_echo_proxy_start_unbounded_remove_range(self): headers = [('Range', 'bytes=0-')] - resp = self.testapp.get('/rewrite/http://example.com/', headers=headers) + resp = self.testapp.get('/rewrite/http://httpbin.org/range/100', headers=headers) # 
actual response is with range assert resp.status_int == 206 @@ -138,7 +138,7 @@ class TestProxyLiveRewriter: assert self.requestlog[0] == resp.text assert resp.headers['x-archive-orig-x-proxy'] == 'test' - assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1') + assert self.requestlog[0].startswith('GET http://httpbin.org/range/100 HTTP/1.1') assert 'range: ' not in self.requestlog[0] assert len(self.cache) == 0 diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index fdc94415..0f4129f4 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -4,7 +4,6 @@ from pywb.framework.wsgi_wrappers import init_app import webtest import pywb.rewrite.rewrite_live - #================================================================= class MockYTDWrapper(object): def extract_info(self, url): @@ -47,6 +46,7 @@ class TestLiveRewriter: def test_live_live_frame(self): resp = self.testapp.get('/live/http://example.com/') assert resp.status_int == 200 + resp.charset = 'utf-8' assert '