mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop'
This commit is contained in:
commit
f4e5a7df5d
@ -1,4 +1,4 @@
|
||||
PyWb 0.30.1
|
||||
PyWb 0.31.0
|
||||
===========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '0.30.1'
|
||||
__version__ = '0.31.0'
|
||||
|
||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||
|
||||
|
@ -153,7 +153,7 @@ class CDXObject(OrderedDict):
|
||||
raise CDXException(msg)
|
||||
|
||||
for header, field in zip(cdxformat, fields):
|
||||
self[header] = field.decode('utf-8')
|
||||
self[header] = to_native_str(field, 'utf-8')
|
||||
|
||||
self.cdxline = cdxline
|
||||
|
||||
@ -213,7 +213,7 @@ class CDXObject(OrderedDict):
|
||||
|
||||
def __str__(self):
|
||||
if self.cdxline:
|
||||
return self.cdxline.decode('utf-8')
|
||||
return to_native_str(self.cdxline, 'utf-8')
|
||||
|
||||
if not self._from_json:
|
||||
return ' '.join(str(val) for val in six.itervalues(self))
|
||||
@ -263,7 +263,7 @@ class IDXObject(OrderedDict):
|
||||
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
||||
|
||||
for header, field in zip(self.FORMAT, fields):
|
||||
self[header] = field.decode('utf-8')
|
||||
self[header] = to_native_str(field, 'utf-8')
|
||||
|
||||
self['offset'] = int(self['offset'])
|
||||
self['length'] = int(self['length'])
|
||||
@ -285,4 +285,4 @@ class IDXObject(OrderedDict):
|
||||
return json_encode(self) + '\n'
|
||||
|
||||
def __str__(self):
|
||||
return self.idxline.decode('utf-8')
|
||||
return to_native_str(self.idxline, 'utf-8')
|
||||
|
@ -184,14 +184,15 @@ class WbRequest(object):
|
||||
if not self.wb_url:
|
||||
return
|
||||
|
||||
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
|
||||
mime = self.env.get('CONTENT_TYPE', '')
|
||||
length = self.env.get('CONTENT_LENGTH')
|
||||
stream = self.env['wsgi.input']
|
||||
|
||||
buffered_stream = BytesIO()
|
||||
|
||||
post_query = extract_post_query('POST', mime, length, stream,
|
||||
buffered_stream=buffered_stream)
|
||||
buffered_stream=buffered_stream,
|
||||
environ=self.env)
|
||||
|
||||
if post_query:
|
||||
self.env['wsgi.input'] = buffered_stream
|
||||
|
@ -120,7 +120,7 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
def _rewrite_meta_refresh(self, meta_refresh):
|
||||
if not meta_refresh:
|
||||
return None
|
||||
return ''
|
||||
|
||||
m = self.META_REFRESH_REGEX.match(meta_refresh)
|
||||
if not m:
|
||||
@ -133,6 +133,9 @@ class HTMLRewriterMixin(object):
|
||||
return meta_refresh
|
||||
|
||||
def _rewrite_base(self, url, mod=''):
|
||||
if not url:
|
||||
return ''
|
||||
|
||||
url = self._ensure_url_has_path(url)
|
||||
|
||||
base_url = self._rewrite_url(url, mod)
|
||||
@ -183,11 +186,11 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
def _rewrite_url(self, value, mod=None):
|
||||
if not value:
|
||||
return None
|
||||
return ''
|
||||
|
||||
value = value.strip()
|
||||
if not value:
|
||||
return None
|
||||
return ''
|
||||
|
||||
value = self.try_unescape(value)
|
||||
return self.url_rewriter.rewrite(value, mod)
|
||||
@ -209,21 +212,24 @@ class HTMLRewriterMixin(object):
|
||||
return new_value
|
||||
|
||||
def _rewrite_srcset(self, value, mod=''):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
values = value.split(',')
|
||||
values = map(lambda x: self._rewrite_url(x.strip()), values)
|
||||
values = [self._rewrite_url(v.strip()) for v in values]
|
||||
return ', '.join(values)
|
||||
|
||||
def _rewrite_css(self, css_content):
|
||||
if css_content:
|
||||
return self.css_rewriter.rewrite(css_content)
|
||||
else:
|
||||
return None
|
||||
return ''
|
||||
|
||||
def _rewrite_script(self, script_content):
|
||||
if script_content:
|
||||
return self.js_rewriter.rewrite(script_content)
|
||||
else:
|
||||
return None
|
||||
return ''
|
||||
|
||||
def has_attr(self, tag_attrs, attr):
|
||||
name, value = attr
|
||||
@ -252,6 +258,11 @@ class HTMLRewriterMixin(object):
|
||||
self.out.write('<' + tag)
|
||||
|
||||
for attr_name, attr_value in tag_attrs:
|
||||
empty_attr = False
|
||||
if attr_value is None:
|
||||
attr_value = ''
|
||||
empty_attr = True
|
||||
|
||||
# special case: inline JS/event handler
|
||||
if ((attr_value and attr_value.startswith('javascript:'))
|
||||
or attr_name.startswith('on')):
|
||||
@ -324,7 +335,7 @@ class HTMLRewriterMixin(object):
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
# write the attr!
|
||||
self._write_attr(attr_name, attr_value)
|
||||
self._write_attr(attr_name, attr_value, empty_attr)
|
||||
|
||||
return True
|
||||
|
||||
@ -347,11 +358,17 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
return True
|
||||
|
||||
def _write_attr(self, name, value):
|
||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||
# 'attr=""' is more common, so use that form
|
||||
if value:
|
||||
def _write_attr(self, name, value, empty_attr):
|
||||
# if empty_attr is set, just write 'attr'!
|
||||
if empty_attr:
|
||||
self.out.write(' ' + name)
|
||||
|
||||
# write with value, if set
|
||||
elif value:
|
||||
|
||||
self.out.write(' ' + name + '="' + value.replace('"', '"') + '"')
|
||||
|
||||
# otherwise, 'attr=""' is more common, so use that form
|
||||
else:
|
||||
self.out.write(' ' + name + '=""')
|
||||
|
||||
@ -421,8 +438,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
def feed(self, string):
|
||||
try:
|
||||
HTMLParser.feed(self, string)
|
||||
except Exception: # pragma: no cover
|
||||
# only raised in 2.6
|
||||
except Exception as e: # pragma: no cover
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
self.out.write(string)
|
||||
|
||||
def _internal_close(self):
|
||||
|
52
pywb/rewrite/rewrite_amf.py
Normal file
52
pywb/rewrite/rewrite_amf.py
Normal file
@ -0,0 +1,52 @@
|
||||
from io import BytesIO
|
||||
from six.moves import zip
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Experimental: not fully tested
|
||||
class RewriteContentAMF(RewriteContent):  #pragma: no cover
    """
    Content rewriter that additionally rewrites ``application/x-amf``
    response bodies before handing off to the base class.

    Requires the optional ``pyamf`` package; if it cannot be imported the
    stream is passed through unchanged.
    """

    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
        """
        Rewrite the stream if the response is AMF, then defer to the
        base class implementation with the (possibly replaced) stream.
        """
        if status_headers.get_header('Content-Type') == 'application/x-amf':
            stream = self.rewrite_amf(stream, env)

        return (super(RewriteContentAMF, self).
                handle_custom_rewrite(text_type, status_headers, stream, env))

    def rewrite_amf(self, stream, env):
        """
        Decode an AMF response and re-encode it so that it lines up with
        the AMF request stored in ``env['pywb.inputdata']`` (if any).

        For each (request body, response body) pair, the response's
        ``correlationId`` is set to the request's ``messageId`` and the
        response is re-keyed with the request's target.

        :param stream: readable stream containing the AMF response
        :param env: request environ dict, or None
        :return: a new ``BytesIO`` with the re-encoded response; on failure,
            a stream yielding the original bytes that were read
        """
        import traceback

        try:
            from pyamf import remoting
        except Exception:
            # nothing has been read yet, so the stream is still intact
            traceback.print_exc()
            return stream

        # the whole body is buffered so the original bytes can still be
        # served if decoding or re-encoding fails
        iobuff = BytesIO()

        try:
            while True:
                buff = stream.read()
                if not buff:
                    break
                iobuff.write(buff)

            iobuff.seek(0)
            res = remoting.decode(iobuff)

            inputdata = env.get('pywb.inputdata') if env else None
            if inputdata:
                new_list = []

                # zip() stops at the shorter of the two body lists
                for src, target in zip(inputdata.bodies, res.bodies):
                    target[1].body.correlationId = src[1].body[0].messageId
                    new_list.append((src[0], target[1]))

                res.bodies = new_list

            return BytesIO(remoting.encode(res).getvalue())

        except Exception:
            traceback.print_exc()
            # `stream` has already been consumed at this point, so returning
            # it would produce an empty body; replay the buffered bytes
            iobuff.seek(0)
            return iobuff
|
@ -4,7 +4,7 @@ import webencodings
|
||||
import yaml
|
||||
import re
|
||||
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
#from chardet.universaldetector import UniversalDetector
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
||||
@ -21,7 +21,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteContent:
|
||||
class RewriteContent(object):
|
||||
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
||||
|
||||
TAG_REGEX = re.compile(b'^\s*\<')
|
||||
@ -77,6 +77,7 @@ class RewriteContent:
|
||||
|
||||
|
||||
def _check_encoding(self, rewritten_headers, stream, enc):
|
||||
matched = False
|
||||
if (rewritten_headers.
|
||||
contains_removed_header('content-encoding', enc)):
|
||||
|
||||
@ -87,14 +88,15 @@ class RewriteContent:
|
||||
stream = DecompressingBufferedReader(stream, decomp_type=enc)
|
||||
|
||||
rewritten_headers.status_headers.remove_header('content-length')
|
||||
matched = True
|
||||
|
||||
return stream
|
||||
return matched, stream
|
||||
|
||||
|
||||
|
||||
def rewrite_content(self, urlrewriter, status_headers, stream,
|
||||
head_insert_func=None, urlkey='',
|
||||
cdx=None, cookie_rewriter=None):
|
||||
cdx=None, cookie_rewriter=None, env=None):
|
||||
|
||||
wb_url = urlrewriter.wburl
|
||||
|
||||
@ -118,9 +120,12 @@ class RewriteContent:
|
||||
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
# use rewritten headers, but no further rewriting needed
|
||||
if rewritten_headers.text_type is None:
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
res = self.handle_custom_rewrite(rewritten_headers.text_type,
|
||||
status_headers,
|
||||
stream,
|
||||
env)
|
||||
if res:
|
||||
return res
|
||||
|
||||
# Handle text content rewriting
|
||||
# ====================================================================
|
||||
@ -136,8 +141,12 @@ class RewriteContent:
|
||||
encoding = None
|
||||
first_buff = b''
|
||||
|
||||
stream = self._check_encoding(rewritten_headers, stream, 'gzip')
|
||||
stream = self._check_encoding(rewritten_headers, stream, 'deflate')
|
||||
for decomp_type in BufferedReader.get_supported_decompressors():
|
||||
matched, stream = self._check_encoding(rewritten_headers,
|
||||
stream,
|
||||
decomp_type)
|
||||
if matched:
|
||||
break
|
||||
|
||||
if mod == 'js_':
|
||||
text_type, stream = self._resolve_text_type('js',
|
||||
@ -237,6 +246,11 @@ class RewriteContent:
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
    """
    Hook called before text rewriting starts.

    Returns a ``(status_headers, generator, is_rewritten)`` tuple when
    the content needs no further rewriting, i.e. when ``text_type`` is
    None. Otherwise returns None.

    ``env`` is not used by this implementation.
    """
    if text_type is not None:
        return None

    # use rewritten headers, but no further rewriting needed
    passthrough = self.stream_to_gen(stream)
    return (status_headers, passthrough, False)
|
||||
|
||||
@staticmethod
|
||||
def _extract_html_charset(buff, status_headers):
|
||||
charset = None
|
||||
@ -360,3 +374,5 @@ class RewriteContent:
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
|
||||
|
||||
|
@ -49,6 +49,12 @@ r"""
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
||||
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
|
||||
# Empty url
|
||||
>>> parse('<base href="">')
|
||||
<base href="">
|
||||
|
||||
>>> parse('<base href>')
|
||||
<base href>
|
||||
|
||||
|
||||
# HTML Entities
|
||||
@ -66,6 +72,10 @@ r"""
|
||||
>>> parse('<input value="&X&"">X</input>')
|
||||
<input value="&X&"">X</input>
|
||||
|
||||
# Empty values should be ignored
|
||||
>>> parse('<input name="foo" value>')
|
||||
<input name="foo" value>
|
||||
|
||||
# SKIPPED
|
||||
# Unicode -- default with %-encoding
|
||||
#>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
||||
@ -92,7 +102,7 @@ r"""
|
||||
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<meta http-equiv="refresh" content="">
|
||||
<meta http-equiv="refresh" content>
|
||||
|
||||
>>> parse('<meta property="og:image" content="http://example.com/example.jpg">')
|
||||
<meta property="og:image" content="/web/20131226101010/http://example.com/example.jpg">
|
||||
@ -115,6 +125,10 @@ r"""
|
||||
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
|
||||
<img srcset="/web/20131226101010///example.com/1x 1x, /web/20131226101010///example.com/foo 2x, /web/20131226101010/https://example.com/bar 4x">
|
||||
|
||||
# empty srcset attrib
|
||||
>>> parse('<img srcset="">')
|
||||
<img srcset="">
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
@ -131,7 +145,7 @@ r"""
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
||||
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
||||
|
@ -123,12 +123,23 @@ function notify_top() {
|
||||
return;
|
||||
}
|
||||
|
||||
if (window.__WB_top_frame.update_wb_url) {
|
||||
window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
|
||||
wbinfo.timestamp,
|
||||
wbinfo.request_ts,
|
||||
wbinfo.is_live);
|
||||
}
|
||||
//if (window.__WB_top_frame.update_wb_url) {
|
||||
// window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
|
||||
// wbinfo.timestamp,
|
||||
// wbinfo.request_ts,
|
||||
// wbinfo.is_live);
|
||||
//}
|
||||
|
||||
var message = {
|
||||
"url": window.WB_wombat_location.href,
|
||||
"ts": wbinfo.timestamp,
|
||||
"request_ts": wbinfo.request_ts,
|
||||
"is_live": wbinfo.is_live,
|
||||
"title": "",
|
||||
"wb_type": "load",
|
||||
}
|
||||
|
||||
window.__WB_top_frame.postMessage(message, "*");
|
||||
|
||||
remove_event("readystatechange", notify_top, document);
|
||||
}
|
||||
|
@ -38,27 +38,21 @@ function make_url(url, ts, mod)
|
||||
}
|
||||
}
|
||||
|
||||
function push_state(url, timestamp, request_ts, capture_str, is_live) {
|
||||
function push_state(state) {
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
if (frame.WB_wombat_location) {
|
||||
var curr_href = frame.WB_wombat_location.href;
|
||||
|
||||
// If not current url, don't update
|
||||
if (url != curr_href) {
|
||||
if (state.url != curr_href) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
var state = {}
|
||||
state.timestamp = timestamp;
|
||||
state.request_ts = request_ts;
|
||||
state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod);
|
||||
state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod);
|
||||
state.url = url;
|
||||
state.capture_str = capture_str;
|
||||
state.is_live = is_live;
|
||||
state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
|
||||
state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
|
||||
|
||||
var canon_url = make_url(url, state.request_ts, "");
|
||||
var canon_url = make_url(state.url, state.request_ts, "");
|
||||
if (window.location.href != canon_url) {
|
||||
window.history.replaceState(state, "", canon_url);
|
||||
}
|
||||
@ -157,7 +151,13 @@ function iframe_loaded(event) {
|
||||
request_ts = ts;
|
||||
}
|
||||
|
||||
update_wb_url(url, ts, request_ts, is_live);
|
||||
var state = {}
|
||||
state["url"] = url;
|
||||
state["ts"] = ts;
|
||||
state["request_ts"] = request_ts;
|
||||
state["is_live"] = is_live
|
||||
|
||||
update_wb_url(state);
|
||||
}
|
||||
|
||||
|
||||
@ -165,12 +165,18 @@ function init_pm() {
|
||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||
|
||||
window.addEventListener("message", function(event) {
|
||||
// Pass to replay frame
|
||||
if (event.source == window.parent) {
|
||||
// Pass to replay frame
|
||||
frame.postMessage(event.data, "*");
|
||||
} else if (event.source == frame) {
|
||||
// Pass to parent
|
||||
window.parent.postMessage(event.data, "*");
|
||||
|
||||
// Check if iframe url change message
|
||||
if (typeof(event.data) == "object" && event.data["wb_type"]) {
|
||||
update_wb_url(event.data);
|
||||
} else {
|
||||
// Pass to parent
|
||||
window.parent.postMessage(event.data, "*");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@ -181,14 +187,14 @@ function init_pm() {
|
||||
}
|
||||
|
||||
|
||||
function update_wb_url(url, ts, request_ts, is_live) {
|
||||
if (curr_state.url == url && curr_state.timestamp == ts) {
|
||||
function update_wb_url(state) {
|
||||
if (curr_state.url == state.url && curr_state.ts == state.ts) {
|
||||
return;
|
||||
}
|
||||
|
||||
capture_str = _wb_js.ts_to_date(ts, true);
|
||||
state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
|
||||
|
||||
push_state(url, ts, request_ts, capture_str, is_live);
|
||||
push_state(state);
|
||||
}
|
||||
|
||||
// Load Banner
|
||||
@ -237,3 +243,4 @@ function init_hash_connect() {
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", init_hash_connect);
|
||||
|
||||
|
@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) {
|
||||
var parser = make_parser(extract_orig($wbwindow.document.baseURI));
|
||||
var href = parser.href;
|
||||
var hash = href.lastIndexOf("#");
|
||||
|
||||
if (hash >= 0) {
|
||||
href = href.substring(0, hash);
|
||||
}
|
||||
@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
if (lastslash >= 0 && lastslash != (href.length - 1)) {
|
||||
href = href.substring(0, lastslash + 1);
|
||||
} else {
|
||||
href += "/";
|
||||
}
|
||||
|
||||
parser.href = href + url;
|
||||
@ -667,15 +666,15 @@ var wombat_internal = function($wbwindow) {
|
||||
// Adapted from:
|
||||
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
|
||||
|
||||
Math.seed = parseInt(seed);
|
||||
$wbwindow.Math.seed = parseInt(seed);
|
||||
function seeded_random() {
|
||||
Math.seed = (Math.seed * 9301 + 49297) % 233280;
|
||||
var rnd = Math.seed / 233280;
|
||||
$wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280;
|
||||
var rnd = $wbwindow.Math.seed / 233280;
|
||||
|
||||
return rnd;
|
||||
}
|
||||
|
||||
Math.random = seeded_random;
|
||||
$wbwindow.Math.random = seeded_random;
|
||||
}
|
||||
|
||||
function init_crypto_random() {
|
||||
@ -687,7 +686,7 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
var new_getrandom = function(array) {
|
||||
for (i = 0; i < array.length; i++) {
|
||||
array[i] = parseInt(Math.random() * 4294967296);
|
||||
array[i] = parseInt($wbwindow.Math.random() * 4294967296);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
@ -719,11 +718,23 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
orig_func.call(this, state_obj, title, url);
|
||||
|
||||
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
|
||||
$wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
|
||||
wb_info.timestamp,
|
||||
wb_info.request_ts,
|
||||
wb_info.is_live);
|
||||
//if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
|
||||
// $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
|
||||
// wb_info.timestamp,
|
||||
// wb_info.request_ts,
|
||||
// wb_info.is_live);
|
||||
//}
|
||||
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
|
||||
var message = {
|
||||
"url": url,
|
||||
"ts": wb_info.timestamp,
|
||||
"request_ts": wb_info.request_ts,
|
||||
"is_live": wb_info.is_live,
|
||||
"title": title,
|
||||
"wb_type": func_name,
|
||||
}
|
||||
|
||||
$wbwindow.__WB_top_frame.postMessage(message, "*");
|
||||
}
|
||||
}
|
||||
|
||||
@ -931,7 +942,8 @@ var wombat_internal = function($wbwindow) {
|
||||
//var timezone = new Date().getTimezoneOffset() * 60 * 1000;
|
||||
// Already UTC!
|
||||
var timezone = 0;
|
||||
var timediff = $wbwindow.Date.now() - (timestamp - timezone);
|
||||
var start_now = $wbwindow.Date.now()
|
||||
var timediff = start_now - (timestamp - timezone);
|
||||
|
||||
if ($wbwindow.__wb_Date_now) {
|
||||
return;
|
||||
@ -1656,13 +1668,14 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
var from = source.WB_wombat_location.origin;
|
||||
|
||||
if (!source.__WB_id) {
|
||||
source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href;
|
||||
}
|
||||
if (!this.__WB_win_id) {
|
||||
this.__WB_win_id = {};
|
||||
this.__WB_counter = 0;
|
||||
}
|
||||
|
||||
if (!source.__WB_id) {
|
||||
source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href;
|
||||
}
|
||||
this.__WB_win_id[source.__WB_id] = source;
|
||||
|
||||
src_id = source.__WB_id;
|
||||
@ -1783,19 +1796,22 @@ var wombat_internal = function($wbwindow) {
|
||||
//============================================
|
||||
function init_open_override()
|
||||
{
|
||||
if (!$wbwindow.Window.prototype.open) {
|
||||
return;
|
||||
var orig = $wbwindow.open;
|
||||
|
||||
if ($wbwindow.Window.prototype.open) {
|
||||
orig = $wbwindow.Window.prototype.open;
|
||||
}
|
||||
|
||||
var orig = $wbwindow.Window.prototype.open;
|
||||
|
||||
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
|
||||
strUrl = rewrite_url(strUrl);
|
||||
strUrl = rewrite_url(strUrl, false, "");
|
||||
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
|
||||
}
|
||||
|
||||
$wbwindow.open = open_rewritten;
|
||||
$wbwindow.Window.prototype.open = open_rewritten;
|
||||
|
||||
if ($wbwindow.Window.prototype.open) {
|
||||
$wbwindow.Window.prototype.open = open_rewritten;
|
||||
}
|
||||
|
||||
for (var i = 0; i < $wbwindow.frames.length; i++) {
|
||||
try {
|
||||
@ -2086,7 +2102,7 @@ var wombat_internal = function($wbwindow) {
|
||||
|
||||
//============================================
|
||||
function get_final_url(prefix, mod, url) {
|
||||
if (!mod) {
|
||||
if (mod == undefined) {
|
||||
mod = wb_info.mod;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
from io import BytesIO
|
||||
import zlib
|
||||
import brotli
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -17,6 +18,11 @@ def deflate_decompressor():
|
||||
def deflate_decompressor_alt():
|
||||
return zlib.decompressobj(-zlib.MAX_WBITS)
|
||||
|
||||
def brotli_decompressor():
    """
    Create a new ``brotli.Decompressor`` with an ``unused_data``
    attribute initialised to None.
    """
    decompressor = brotli.Decompressor()
    # NOTE(review): presumably added so the object exposes the same
    # `unused_data` attribute as zlib decompress objects -- confirm
    # against the reader code that consumes it
    decompressor.unused_data = None
    return decompressor
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BufferedReader(object):
|
||||
@ -40,7 +46,9 @@ class BufferedReader(object):
|
||||
|
||||
DECOMPRESSORS = {'gzip': gzip_decompressor,
|
||||
'deflate': deflate_decompressor,
|
||||
'deflate_alt': deflate_decompressor_alt}
|
||||
'deflate_alt': deflate_decompressor_alt,
|
||||
'br': brotli_decompressor
|
||||
}
|
||||
|
||||
def __init__(self, stream, block_size=1024,
|
||||
decomp_type=None,
|
||||
@ -98,7 +106,7 @@ class BufferedReader(object):
|
||||
if self.decompressor and data:
|
||||
try:
|
||||
data = self.decompressor.decompress(data)
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
# if first read attempt, assume non-gzipped stream
|
||||
if self.num_read == 0:
|
||||
if self.decomp_type == 'deflate':
|
||||
@ -108,7 +116,8 @@ class BufferedReader(object):
|
||||
self.decompressor = None
|
||||
# otherwise (partly decompressed), something is wrong
|
||||
else:
|
||||
raise
|
||||
print(str(e))
|
||||
return b''
|
||||
return data
|
||||
|
||||
def read(self, length=None):
|
||||
@ -180,6 +189,10 @@ class BufferedReader(object):
|
||||
self.stream.close()
|
||||
self.stream = None
|
||||
|
||||
@classmethod
|
||||
def get_supported_decompressors(cls):
|
||||
return cls.DECOMPRESSORS.keys()
|
||||
|
||||
|
||||
#=================================================================
|
||||
class DecompressingBufferedReader(BufferedReader):
|
||||
|
@ -9,10 +9,12 @@ import requests
|
||||
|
||||
import six
|
||||
from six.moves.urllib.request import pathname2url, url2pathname
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
||||
|
||||
import time
|
||||
import pkg_resources
|
||||
import base64
|
||||
import cgi
|
||||
|
||||
from io import open, BytesIO
|
||||
|
||||
@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
||||
def extract_post_query(method, mime, length, stream,
|
||||
buffered_stream=None,
|
||||
environ=None):
|
||||
"""
|
||||
Extract a url-encoded form POST from stream
|
||||
If not a application/x-www-form-urlencoded, or no missing
|
||||
content length, return None
|
||||
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
||||
otherwise read whole block and b64encode
|
||||
"""
|
||||
if method.upper() != 'POST':
|
||||
return None
|
||||
|
||||
if ((not mime or
|
||||
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
||||
return None
|
||||
|
||||
try:
|
||||
length = int(length)
|
||||
except (ValueError, TypeError):
|
||||
@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
||||
buffered_stream.write(post_query)
|
||||
buffered_stream.seek(0)
|
||||
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = unquote_plus(post_query)
|
||||
if not mime:
|
||||
mime = ''
|
||||
|
||||
if mime.startswith('application/x-www-form-urlencoded'):
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = unquote_plus(post_query)
|
||||
|
||||
elif mime.startswith('multipart/'):
|
||||
env = {'REQUEST_METHOD': 'POST',
|
||||
'CONTENT_TYPE': mime,
|
||||
'CONTENT_LENGTH': len(post_query)}
|
||||
|
||||
args = dict(fp=BytesIO(post_query),
|
||||
environ=env,
|
||||
keep_blank_values=True)
|
||||
|
||||
if six.PY3:
|
||||
args['encoding'] = 'utf-8'
|
||||
|
||||
data = cgi.FieldStorage(**args)
|
||||
|
||||
values = []
|
||||
for item in data.list:
|
||||
values.append((item.name, item.value))
|
||||
|
||||
post_query = urlencode(values, True)
|
||||
|
||||
elif mime.startswith('application/x-amf'):
|
||||
post_query = amf_parse(post_query, environ)
|
||||
|
||||
else:
|
||||
post_query = base64.b64encode(post_query)
|
||||
post_query = to_native_str(post_query)
|
||||
post_query = '&__wb_post_data=' + post_query
|
||||
|
||||
return post_query
|
||||
|
||||
|
||||
#=================================================================
|
||||
def amf_parse(string, environ):
    """
    Decode an AMF request body and build a url-encoded query from it.

    The ``body``, ``source`` and ``operation`` attributes of the first
    message of the first request body, when present, are encoded under
    the keys ``body``, ``source`` and ``op``. If ``environ`` is not None,
    the decoded request is stored in ``environ['pywb.inputdata']``.

    :param string: raw bytes of the AMF request
    :param environ: environ dict to store the decoded request in, or None
    :return: the url-encoded query string, or None if anything fails
        (including ``pyamf`` not being importable)
    """
    # (attribute on the AMF message, key used in the query)
    fields = (('body', 'body'),
              ('source', 'source'),
              ('operation', 'op'))

    try:
        from pyamf import remoting

        decoded = remoting.decode(BytesIO(string))
        message = decoded.bodies[0][1].body[0]

        values = {}
        for attr, key in fields:
            if hasattr(message, attr):
                values[key] = getattr(message, attr)

        if environ is not None:
            environ['pywb.inputdata'] = decoded

        return urlencode(values)

    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
def append_post_query(url, post_query):
|
||||
if not post_query:
|
||||
@ -167,23 +236,34 @@ def read_last_line(fh, offset=256):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BlockLoader(object):
|
||||
class BaseLoader(object):
    """
    Base class for loaders: accepts (and ignores) arbitrary keyword
    arguments and declares the ``load`` interface.
    """

    def __init__(self, **kwargs):
        pass

    def load(self, url, offset=0, length=-1):
        """
        Load content from ``url``; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a non-callable constant, so calling it raised
        # a TypeError; NotImplementedError is the intended exception
        raise NotImplementedError()
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BlockLoader(BaseLoader):
|
||||
"""
|
||||
a loader which can stream blocks of content
|
||||
given a uri, offset and optional length.
|
||||
Currently supports: http/https and file/local file system
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
loaders = {}
|
||||
profile_loader = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.cached = {}
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def load(self, url, offset=0, length=-1):
|
||||
loader = self._get_loader_for(url)
|
||||
loader, url = self._get_loader_for_url(url)
|
||||
return loader.load(url, offset, length)
|
||||
|
||||
def _get_loader_for(self, url):
|
||||
def _get_loader_for_url(self, url):
|
||||
"""
|
||||
Determine loading method based on uri
|
||||
"""
|
||||
@ -193,18 +273,47 @@ class BlockLoader(object):
|
||||
else:
|
||||
type_ = parts[0]
|
||||
|
||||
if '+' in type_:
|
||||
profile_name, scheme = type_.split('+', 1)
|
||||
if len(parts) == 2:
|
||||
url = scheme + '://' + parts[1]
|
||||
else:
|
||||
profile_name = ''
|
||||
scheme = type_
|
||||
|
||||
loader = self.cached.get(type_)
|
||||
if loader:
|
||||
return loader
|
||||
return loader, url
|
||||
|
||||
loader_cls = self._get_loader_class_for_type(scheme)
|
||||
|
||||
loader_cls = LOADERS.get(type_)
|
||||
if not loader_cls:
|
||||
raise IOError('No Loader for type: ' + type_)
|
||||
raise IOError('No Loader for type: ' + scheme)
|
||||
|
||||
profile = self.kwargs
|
||||
|
||||
if self.profile_loader:
|
||||
profile = self.profile_loader(profile_name, scheme)
|
||||
|
||||
loader = loader_cls(**profile)
|
||||
|
||||
loader = loader_cls(*self.args, **self.kwargs)
|
||||
self.cached[type_] = loader
|
||||
return loader
|
||||
return loader, url
|
||||
|
||||
def _get_loader_class_for_type(self, type_):
|
||||
loader_cls = self.loaders.get(type_)
|
||||
return loader_cls
|
||||
|
||||
@staticmethod
|
||||
def init_default_loaders():
|
||||
BlockLoader.loaders['http'] = HttpLoader
|
||||
BlockLoader.loaders['https'] = HttpLoader
|
||||
BlockLoader.loaders['s3'] = S3Loader
|
||||
BlockLoader.loaders['file'] = LocalFileLoader
|
||||
|
||||
@staticmethod
|
||||
def set_profile_loader(src):
|
||||
BlockLoader.profile_loader = src
|
||||
|
||||
@staticmethod
|
||||
def _make_range_header(offset, length):
|
||||
@ -217,10 +326,7 @@ class BlockLoader(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LocalFileLoader(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
class LocalFileLoader(BaseLoader):
|
||||
def load(self, url, offset=0, length=-1):
|
||||
"""
|
||||
Load a file-like reader from the local file system
|
||||
@ -260,9 +366,11 @@ class LocalFileLoader(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HttpLoader(object):
|
||||
def __init__(self, cookie_maker=None, *args, **kwargs):
|
||||
self.cookie_maker = cookie_maker
|
||||
class HttpLoader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
self.cookie_maker = kwargs.get('cookie_maker')
|
||||
if not self.cookie_maker:
|
||||
self.cookie_maker = kwargs.get('cookie')
|
||||
self.session = None
|
||||
|
||||
def load(self, url, offset, length):
|
||||
@ -288,33 +396,47 @@ class HttpLoader(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class S3Loader(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
class S3Loader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
self.s3conn = None
|
||||
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
||||
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
||||
|
||||
def load(self, url, offset, length):
|
||||
if not s3_avail: #pragma: no cover
|
||||
raise IOError('To load from s3 paths, ' +
|
||||
'you must install boto: pip install boto')
|
||||
|
||||
if not self.s3conn:
|
||||
try:
|
||||
self.s3conn = connect_s3()
|
||||
except Exception: #pragma: no cover
|
||||
self.s3conn = connect_s3(anon=True)
|
||||
aws_access_key_id = self.aws_access_key_id
|
||||
aws_secret_access_key = self.aws_secret_access_key
|
||||
|
||||
parts = urlsplit(url)
|
||||
|
||||
bucket = self.s3conn.get_bucket(parts.netloc)
|
||||
if parts.username and parts.password:
|
||||
aws_access_key_id = unquote_plus(parts.username)
|
||||
aws_secret_access_key = unquote_plus(parts.password)
|
||||
bucket_name = parts.netloc.split('@', 1)[-1]
|
||||
else:
|
||||
bucket_name = parts.netloc
|
||||
|
||||
headers = {'Range': BlockLoader._make_range_header(offset, length)}
|
||||
if not self.s3conn:
|
||||
try:
|
||||
self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key)
|
||||
except Exception: #pragma: no cover
|
||||
self.s3conn = connect_s3(anon=True)
|
||||
|
||||
bucket = self.s3conn.get_bucket(bucket_name)
|
||||
|
||||
key = bucket.get_key(parts.path)
|
||||
|
||||
result = key.get_contents_as_string(headers=headers)
|
||||
key.close()
|
||||
if offset == 0 and length == -1:
|
||||
headers = {}
|
||||
else:
|
||||
headers = {'Range': BlockLoader._make_range_header(offset, length)}
|
||||
|
||||
return BytesIO(result)
|
||||
# Read range
|
||||
key.open_read(headers=headers)
|
||||
return key
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -414,12 +536,6 @@ class LimitReader(object):
|
||||
|
||||
return stream
|
||||
|
||||
|
||||
#=================================================================
|
||||
LOADERS = {'http': HttpLoader,
|
||||
'https': HttpLoader,
|
||||
's3': S3Loader,
|
||||
'file': LocalFileLoader
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
BlockLoader.init_default_loaders()
|
||||
|
||||
|
@ -133,6 +133,14 @@ def compress_alt(buff):
|
||||
|
||||
return compressed
|
||||
|
||||
# Brotli
|
||||
|
||||
def test_brotli():
|
||||
with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
|
||||
x = DecompressingBufferedReader(fh, decomp_type='br')
|
||||
x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
|
||||
|
||||
|
||||
|
||||
# Errors
|
||||
|
||||
@ -140,9 +148,11 @@ def test_err_compress_mix():
|
||||
# error: compressed member, followed by not compressed -- considered invalid
|
||||
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
|
||||
b = x.read()
|
||||
b = x.read_next_member()
|
||||
with pytest.raises(zlib.error):
|
||||
x.read()
|
||||
assert b == b'ABC'
|
||||
x.read_next_member()
|
||||
assert x.read() == b''
|
||||
#with pytest.raises(zlib.error):
|
||||
# x.read()
|
||||
#error: Error -3 while decompressing: incorrect header check
|
||||
|
||||
def test_err_chunk_cut_off():
|
||||
|
@ -37,17 +37,21 @@ Traceback (most recent call last):
|
||||
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
||||
|
||||
# HMAC Cookie Maker
|
||||
>>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
|
||||
>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
|
||||
'Example Domain'
|
||||
|
||||
# fixed cookie, range request
|
||||
>>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read())
|
||||
>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read())
|
||||
'Example Domain'
|
||||
|
||||
# range request
|
||||
>>> print_str(BlockLoader().load('http://example.com', 1262).read())
|
||||
'</html>\n'
|
||||
|
||||
# custom profile
|
||||
>>> print_str(BlockLoader().load('local+http://example.com', 1262).read())
|
||||
'</html>\n'
|
||||
|
||||
# unknown loader error
|
||||
#>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL
|
||||
#Traceback (most recent call last):
|
||||
@ -90,8 +94,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
||||
# unsupported method
|
||||
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||
|
||||
# unsupported type
|
||||
# base64 encode
|
||||
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
||||
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||
|
||||
# invalid length
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
||||
|
@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
|
||||
self['mime'] = def_mime
|
||||
if mime:
|
||||
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
||||
self['_content_type'] = mime
|
||||
|
||||
def extract_status(self, status_headers):
|
||||
""" Extract status code only from status line
|
||||
@ -390,7 +391,7 @@ class DefaultRecordParser(object):
|
||||
len_ = record.status_headers.get_header('Content-Length')
|
||||
|
||||
post_query = extract_post_query(method,
|
||||
entry.get('mime'),
|
||||
entry.get('_content_type'),
|
||||
len_,
|
||||
record.stream)
|
||||
|
||||
|
@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object):
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||
verify_http=True):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker)
|
||||
loader = BlockLoader(cookie_maker=cookie_maker)
|
||||
|
||||
self.loader = loader
|
||||
self.block_size = block_size
|
||||
|
@ -174,7 +174,8 @@ class ReplayView(object):
|
||||
stream=stream,
|
||||
head_insert_func=head_insert_func,
|
||||
urlkey=cdx['urlkey'],
|
||||
cdx=cdx))
|
||||
cdx=cdx,
|
||||
env=wbrequest.env))
|
||||
|
||||
(status_headers, response_iter, is_rewritten) = result
|
||||
|
||||
|
2
sample_archive/text_content/quickfox_repeated.compressed
Normal file
2
sample_archive/text_content/quickfox_repeated.compressed
Normal file
@ -0,0 +1,2 @@
|
||||
[яЇА"y\ыZЊB;ф%UZ’™±5Ићћ
|
||||
{Kђ№<<3C>И @ужЩMдme'‡_¦й0–{<ШS
|
6
setup.py
6
setup.py
@ -78,7 +78,8 @@ setup(
|
||||
'requests',
|
||||
'redis',
|
||||
'jinja2',
|
||||
'surt==0.3b4',
|
||||
'surt>=0.3.0',
|
||||
'brotlipy',
|
||||
'pyyaml',
|
||||
'watchdog',
|
||||
'webencodings',
|
||||
@ -90,9 +91,6 @@ setup(
|
||||
'fakeredis',
|
||||
'mock',
|
||||
],
|
||||
dependency_links=[
|
||||
'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
|
||||
],
|
||||
cmdclass={'test': PyTest},
|
||||
test_suite='',
|
||||
entry_points="""
|
||||
|
Loading…
x
Reference in New Issue
Block a user