diff --git a/README.rst b/README.rst
index fc257400..010a6f3e 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.30.1
+PyWb 0.31.0
===========
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
diff --git a/pywb/__init__.py b/pywb/__init__.py
index c3b4b701..9f66d658 100644
--- a/pywb/__init__.py
+++ b/pywb/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.30.1'
+__version__ = '0.31.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'
diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py
index 432d69e4..3e8dddc5 100644
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -153,7 +153,7 @@ class CDXObject(OrderedDict):
raise CDXException(msg)
for header, field in zip(cdxformat, fields):
- self[header] = field.decode('utf-8')
+ self[header] = to_native_str(field, 'utf-8')
self.cdxline = cdxline
@@ -213,7 +213,7 @@ class CDXObject(OrderedDict):
def __str__(self):
if self.cdxline:
- return self.cdxline.decode('utf-8')
+ return to_native_str(self.cdxline, 'utf-8')
if not self._from_json:
return ' '.join(str(val) for val in six.itervalues(self))
@@ -263,7 +263,7 @@ class IDXObject(OrderedDict):
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in zip(self.FORMAT, fields):
- self[header] = field.decode('utf-8')
+ self[header] = to_native_str(field, 'utf-8')
self['offset'] = int(self['offset'])
self['length'] = int(self['length'])
@@ -285,4 +285,4 @@ class IDXObject(OrderedDict):
return json_encode(self) + '\n'
def __str__(self):
- return self.idxline.decode('utf-8')
+ return to_native_str(self.idxline, 'utf-8')
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 0d2634f5..36afff40 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -184,14 +184,15 @@ class WbRequest(object):
if not self.wb_url:
return
- mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
+ mime = self.env.get('CONTENT_TYPE', '')
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream,
- buffered_stream=buffered_stream)
+ buffered_stream=buffered_stream,
+ environ=self.env)
if post_query:
self.env['wsgi.input'] = buffered_stream
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index e57f8591..90148c1f 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -120,7 +120,7 @@ class HTMLRewriterMixin(object):
def _rewrite_meta_refresh(self, meta_refresh):
if not meta_refresh:
- return None
+ return ''
m = self.META_REFRESH_REGEX.match(meta_refresh)
if not m:
@@ -133,6 +133,9 @@ class HTMLRewriterMixin(object):
return meta_refresh
def _rewrite_base(self, url, mod=''):
+ if not url:
+ return ''
+
url = self._ensure_url_has_path(url)
base_url = self._rewrite_url(url, mod)
@@ -183,11 +186,11 @@ class HTMLRewriterMixin(object):
def _rewrite_url(self, value, mod=None):
if not value:
- return None
+ return ''
value = value.strip()
if not value:
- return None
+ return ''
value = self.try_unescape(value)
return self.url_rewriter.rewrite(value, mod)
@@ -209,21 +212,24 @@ class HTMLRewriterMixin(object):
return new_value
def _rewrite_srcset(self, value, mod=''):
+ if not value:
+ return ''
+
values = value.split(',')
- values = map(lambda x: self._rewrite_url(x.strip()), values)
+ values = [self._rewrite_url(v.strip()) for v in values]
return ', '.join(values)
def _rewrite_css(self, css_content):
if css_content:
return self.css_rewriter.rewrite(css_content)
else:
- return None
+ return ''
def _rewrite_script(self, script_content):
if script_content:
return self.js_rewriter.rewrite(script_content)
else:
- return None
+ return ''
def has_attr(self, tag_attrs, attr):
name, value = attr
@@ -252,6 +258,11 @@ class HTMLRewriterMixin(object):
self.out.write('<' + tag)
for attr_name, attr_value in tag_attrs:
+ empty_attr = False
+ if attr_value is None:
+ attr_value = ''
+ empty_attr = True
+
# special case: inline JS/event handler
if ((attr_value and attr_value.startswith('javascript:'))
or attr_name.startswith('on')):
@@ -324,7 +335,7 @@ class HTMLRewriterMixin(object):
attr_value = self._rewrite_url(attr_value, rw_mod)
# write the attr!
- self._write_attr(attr_name, attr_value)
+ self._write_attr(attr_name, attr_value, empty_attr)
return True
@@ -347,11 +358,17 @@ class HTMLRewriterMixin(object):
return True
- def _write_attr(self, name, value):
- # parser doesn't differentiate between 'attr=""' and just 'attr'
- # 'attr=""' is more common, so use that form
- if value:
+ def _write_attr(self, name, value, empty_attr):
+ # if empty_attr is set, just write 'attr'!
+ if empty_attr:
+ self.out.write(' ' + name)
+
+ # write with value, if set
+ elif value:
+
self.out.write(' ' + name + '="' + value.replace('"', '&quot;') + '"')
+
+ # otherwise, 'attr=""' is more common, so use that form
else:
self.out.write(' ' + name + '=""')
@@ -421,8 +438,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def feed(self, string):
try:
HTMLParser.feed(self, string)
- except Exception: # pragma: no cover
- # only raised in 2.6
+ except Exception as e: # pragma: no cover
+ import traceback
+ traceback.print_exc()
self.out.write(string)
def _internal_close(self):
diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py
new file mode 100644
index 00000000..07a73470
--- /dev/null
+++ b/pywb/rewrite/rewrite_amf.py
@@ -0,0 +1,52 @@
+from io import BytesIO
+from six.moves import zip
+from pywb.rewrite.rewrite_content import RewriteContent
+
+
+# ============================================================================
+# Experimental: not fully tested
+class RewriteContentAMF(RewriteContent): #pragma: no cover
+ def handle_custom_rewrite(self, text_type, status_headers, stream, env):
+
+ if status_headers.get_header('Content-Type') == 'application/x-amf':
+ stream = self.rewrite_amf(stream, env)
+
+ return (super(RewriteContentAMF, self).
+ handle_custom_rewrite(text_type, status_headers, stream, env))
+
+ def rewrite_amf(self, stream, env):
+ try:
+ from pyamf import remoting
+
+ iobuff = BytesIO()
+ while True:
+ buff = stream.read()
+ if not buff:
+ break
+ iobuff.write(buff)
+
+ iobuff.seek(0)
+ res = remoting.decode(iobuff)
+
+ if env and env.get('pywb.inputdata'):
+ inputdata = env.get('pywb.inputdata')
+
+ new_list = []
+
+ for src, target in zip(inputdata.bodies, res.bodies):
+ #print(target[0] + ' = ' + src[0])
+
+ #print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId)
+ target[1].body.correlationId = src[1].body[0].messageId
+
+ new_list.append((src[0], target[1]))
+
+ res.bodies = new_list
+
+ return BytesIO(remoting.encode(res).getvalue())
+
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ print(e)
+ return stream
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 4454ea3c..677e20ae 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -4,7 +4,7 @@ import webencodings
import yaml
import re
-from chardet.universaldetector import UniversalDetector
+#from chardet.universaldetector import UniversalDetector
from io import BytesIO
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
@@ -21,7 +21,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
#=================================================================
-class RewriteContent:
+class RewriteContent(object):
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(b'^\s*\<')
@@ -77,6 +77,7 @@ class RewriteContent:
def _check_encoding(self, rewritten_headers, stream, enc):
+ matched = False
if (rewritten_headers.
contains_removed_header('content-encoding', enc)):
@@ -87,14 +88,15 @@ class RewriteContent:
stream = DecompressingBufferedReader(stream, decomp_type=enc)
rewritten_headers.status_headers.remove_header('content-length')
+ matched = True
- return stream
+ return matched, stream
def rewrite_content(self, urlrewriter, status_headers, stream,
head_insert_func=None, urlkey='',
- cdx=None, cookie_rewriter=None):
+ cdx=None, cookie_rewriter=None, env=None):
wb_url = urlrewriter.wburl
@@ -118,9 +120,12 @@ class RewriteContent:
status_headers = rewritten_headers.status_headers
- # use rewritten headers, but no further rewriting needed
- if rewritten_headers.text_type is None:
- return (status_headers, self.stream_to_gen(stream), False)
+ res = self.handle_custom_rewrite(rewritten_headers.text_type,
+ status_headers,
+ stream,
+ env)
+ if res:
+ return res
# Handle text content rewriting
# ====================================================================
@@ -136,8 +141,12 @@ class RewriteContent:
encoding = None
first_buff = b''
- stream = self._check_encoding(rewritten_headers, stream, 'gzip')
- stream = self._check_encoding(rewritten_headers, stream, 'deflate')
+ for decomp_type in BufferedReader.get_supported_decompressors():
+ matched, stream = self._check_encoding(rewritten_headers,
+ stream,
+ decomp_type)
+ if matched:
+ break
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
@@ -237,6 +246,11 @@ class RewriteContent:
return (status_headers, gen, True)
+ def handle_custom_rewrite(self, text_type, status_headers, stream, env):
+ # use rewritten headers, but no further rewriting needed
+ if text_type is None:
+ return (status_headers, self.stream_to_gen(stream), False)
+
@staticmethod
def _extract_html_charset(buff, status_headers):
charset = None
@@ -360,3 +374,5 @@ class RewriteContent:
finally:
stream.close()
+
+
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 818bd114..afb1da93 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -49,6 +49,12 @@ r"""
>>> parse('
', urlrewriter=no_base_canon_rewriter)
+# Empty url
+>>> parse('')
+
+
+>>> parse('')
+
# HTML Entities
@@ -66,6 +72,10 @@ r"""
>>> parse('X')
X
+# Empty values should be ignored
+>>> parse('')
+
+
# SKIPPED
# Unicode -- default with %-encoding
#>>> parse(u'испытание')
@@ -92,7 +102,7 @@ r"""
>>> parse('')
-
+
>>> parse('')
@@ -115,6 +125,10 @@ r"""
>>> parse('
')
+# empty srcset attrib
+>>> parse('
')
+
+
# Script tag
>>> parse('')
@@ -131,7 +145,7 @@ r"""
>>> parse('
')
-
+
>>> parse('')
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index e186f1df..f57e833b 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -123,12 +123,23 @@ function notify_top() {
return;
}
- if (window.__WB_top_frame.update_wb_url) {
- window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
- wbinfo.timestamp,
- wbinfo.request_ts,
- wbinfo.is_live);
- }
+ //if (window.__WB_top_frame.update_wb_url) {
+ // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
+ // wbinfo.timestamp,
+ // wbinfo.request_ts,
+ // wbinfo.is_live);
+ //}
+
+ var message = {
+ "url": window.WB_wombat_location.href,
+ "ts": wbinfo.timestamp,
+ "request_ts": wbinfo.request_ts,
+ "is_live": wbinfo.is_live,
+ "title": "",
+ "wb_type": "load",
+ }
+
+ window.__WB_top_frame.postMessage(message, "*");
remove_event("readystatechange", notify_top, document);
}
diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js
index c9e47ef3..168b914f 100644
--- a/pywb/static/wb_frame.js
+++ b/pywb/static/wb_frame.js
@@ -38,27 +38,21 @@ function make_url(url, ts, mod)
}
}
-function push_state(url, timestamp, request_ts, capture_str, is_live) {
+function push_state(state) {
var frame = document.getElementById(IFRAME_ID).contentWindow;
if (frame.WB_wombat_location) {
var curr_href = frame.WB_wombat_location.href;
// If not current url, don't update
- if (url != curr_href) {
+ if (state.url != curr_href) {
return;
}
}
- var state = {}
- state.timestamp = timestamp;
- state.request_ts = request_ts;
- state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod);
- state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod);
- state.url = url;
- state.capture_str = capture_str;
- state.is_live = is_live;
+ state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
+ state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
- var canon_url = make_url(url, state.request_ts, "");
+ var canon_url = make_url(state.url, state.request_ts, "");
if (window.location.href != canon_url) {
window.history.replaceState(state, "", canon_url);
}
@@ -157,7 +151,13 @@ function iframe_loaded(event) {
request_ts = ts;
}
- update_wb_url(url, ts, request_ts, is_live);
+ var state = {}
+ state["url"] = url;
+ state["ts"] = ts;
+ state["request_ts"] = request_ts;
+ state["is_live"] = is_live
+
+ update_wb_url(state);
}
@@ -165,12 +165,18 @@ function init_pm() {
var frame = document.getElementById(IFRAME_ID).contentWindow;
window.addEventListener("message", function(event) {
- // Pass to replay frame
if (event.source == window.parent) {
+ // Pass to replay frame
frame.postMessage(event.data, "*");
} else if (event.source == frame) {
- // Pass to parent
- window.parent.postMessage(event.data, "*");
+
+ // Check if iframe url change message
+ if (typeof(event.data) == "object" && event.data["wb_type"]) {
+ update_wb_url(event.data);
+ } else {
+ // Pass to parent
+ window.parent.postMessage(event.data, "*");
+ }
}
});
@@ -181,14 +187,14 @@ function init_pm() {
}
-function update_wb_url(url, ts, request_ts, is_live) {
- if (curr_state.url == url && curr_state.timestamp == ts) {
+function update_wb_url(state) {
+ if (curr_state.url == state.url && curr_state.ts == state.ts) {
return;
}
- capture_str = _wb_js.ts_to_date(ts, true);
+ state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
- push_state(url, ts, request_ts, capture_str, is_live);
+ push_state(state);
}
// Load Banner
@@ -237,3 +243,4 @@ function init_hash_connect() {
}
document.addEventListener("DOMContentLoaded", init_hash_connect);
+
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index 5fb4e2a4..af280f94 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) {
var parser = make_parser(extract_orig($wbwindow.document.baseURI));
var href = parser.href;
var hash = href.lastIndexOf("#");
+
if (hash >= 0) {
href = href.substring(0, hash);
}
@@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) {
if (lastslash >= 0 && lastslash != (href.length - 1)) {
href = href.substring(0, lastslash + 1);
- } else {
- href += "/";
}
parser.href = href + url;
@@ -667,15 +666,15 @@ var wombat_internal = function($wbwindow) {
// Adapted from:
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
- Math.seed = parseInt(seed);
+ $wbwindow.Math.seed = parseInt(seed);
function seeded_random() {
- Math.seed = (Math.seed * 9301 + 49297) % 233280;
- var rnd = Math.seed / 233280;
+ $wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280;
+ var rnd = $wbwindow.Math.seed / 233280;
return rnd;
}
- Math.random = seeded_random;
+ $wbwindow.Math.random = seeded_random;
}
function init_crypto_random() {
@@ -687,7 +686,7 @@ var wombat_internal = function($wbwindow) {
var new_getrandom = function(array) {
for (i = 0; i < array.length; i++) {
- array[i] = parseInt(Math.random() * 4294967296);
+ array[i] = parseInt($wbwindow.Math.random() * 4294967296);
}
return array;
}
@@ -719,11 +718,23 @@ var wombat_internal = function($wbwindow) {
orig_func.call(this, state_obj, title, url);
- if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
- $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
- wb_info.timestamp,
- wb_info.request_ts,
- wb_info.is_live);
+ //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
+ // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
+ // wb_info.timestamp,
+ // wb_info.request_ts,
+ // wb_info.is_live);
+ //}
+ if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
+ var message = {
+ "url": url,
+ "ts": wb_info.timestamp,
+ "request_ts": wb_info.request_ts,
+ "is_live": wb_info.is_live,
+ "title": title,
+ "wb_type": func_name,
+ }
+
+ $wbwindow.__WB_top_frame.postMessage(message, "*");
}
}
@@ -931,7 +942,8 @@ var wombat_internal = function($wbwindow) {
//var timezone = new Date().getTimezoneOffset() * 60 * 1000;
// Already UTC!
var timezone = 0;
- var timediff = $wbwindow.Date.now() - (timestamp - timezone);
+ var start_now = $wbwindow.Date.now()
+ var timediff = start_now - (timestamp - timezone);
if ($wbwindow.__wb_Date_now) {
return;
@@ -1656,13 +1668,14 @@ var wombat_internal = function($wbwindow) {
var from = source.WB_wombat_location.origin;
- if (!source.__WB_id) {
- source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href;
- }
if (!this.__WB_win_id) {
this.__WB_win_id = {};
+ this.__WB_counter = 0;
}
+ if (!source.__WB_id) {
+ source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href;
+ }
this.__WB_win_id[source.__WB_id] = source;
src_id = source.__WB_id;
@@ -1783,19 +1796,22 @@ var wombat_internal = function($wbwindow) {
//============================================
function init_open_override()
{
- if (!$wbwindow.Window.prototype.open) {
- return;
+ var orig = $wbwindow.open;
+
+ if ($wbwindow.Window.prototype.open) {
+ orig = $wbwindow.Window.prototype.open;
}
- var orig = $wbwindow.Window.prototype.open;
-
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
- strUrl = rewrite_url(strUrl);
+ strUrl = rewrite_url(strUrl, false, "");
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
}
$wbwindow.open = open_rewritten;
- $wbwindow.Window.prototype.open = open_rewritten;
+
+ if ($wbwindow.Window.prototype.open) {
+ $wbwindow.Window.prototype.open = open_rewritten;
+ }
for (var i = 0; i < $wbwindow.frames.length; i++) {
try {
@@ -2086,7 +2102,7 @@ var wombat_internal = function($wbwindow) {
//============================================
function get_final_url(prefix, mod, url) {
- if (!mod) {
+ if (mod == undefined) {
mod = wb_info.mod;
}
diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py
index f3268c58..e1ebfc90 100644
--- a/pywb/utils/bufferedreaders.py
+++ b/pywb/utils/bufferedreaders.py
@@ -1,5 +1,6 @@
from io import BytesIO
import zlib
+import brotli
#=================================================================
@@ -17,6 +18,11 @@ def deflate_decompressor():
def deflate_decompressor_alt():
return zlib.decompressobj(-zlib.MAX_WBITS)
+def brotli_decompressor():
+ decomp = brotli.Decompressor()
+ decomp.unused_data = None
+ return decomp
+
#=================================================================
class BufferedReader(object):
@@ -40,7 +46,9 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor,
'deflate': deflate_decompressor,
- 'deflate_alt': deflate_decompressor_alt}
+ 'deflate_alt': deflate_decompressor_alt,
+ 'br': brotli_decompressor
+ }
def __init__(self, stream, block_size=1024,
decomp_type=None,
@@ -98,7 +106,7 @@ class BufferedReader(object):
if self.decompressor and data:
try:
data = self.decompressor.decompress(data)
- except Exception:
+ except Exception as e:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
if self.decomp_type == 'deflate':
@@ -108,7 +116,8 @@ class BufferedReader(object):
self.decompressor = None
# otherwise (partly decompressed), something is wrong
else:
- raise
+ print(str(e))
+ return b''
return data
def read(self, length=None):
@@ -180,6 +189,10 @@ class BufferedReader(object):
self.stream.close()
self.stream = None
+ @classmethod
+ def get_supported_decompressors(cls):
+ return cls.DECOMPRESSORS.keys()
+
#=================================================================
class DecompressingBufferedReader(BufferedReader):
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6dbbf1e2..4c298334 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -9,10 +9,12 @@ import requests
import six
from six.moves.urllib.request import pathname2url, url2pathname
-from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
+from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
import time
import pkg_resources
+import base64
+import cgi
from io import open, BytesIO
@@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
#=================================================================
-def extract_post_query(method, mime, length, stream, buffered_stream=None):
+def extract_post_query(method, mime, length, stream,
+ buffered_stream=None,
+ environ=None):
"""
Extract a url-encoded form POST from stream
- If not a application/x-www-form-urlencoded, or no missing
content length, return None
+ Attempt to decode application/x-www-form-urlencoded or multipart/*,
+ otherwise read whole block and b64encode
"""
if method.upper() != 'POST':
return None
- if ((not mime or
- not mime.lower().startswith('application/x-www-form-urlencoded'))):
- return None
-
try:
length = int(length)
except (ValueError, TypeError):
@@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
buffered_stream.write(post_query)
buffered_stream.seek(0)
- post_query = to_native_str(post_query)
- post_query = unquote_plus(post_query)
+ if not mime:
+ mime = ''
+
+ if mime.startswith('application/x-www-form-urlencoded'):
+ post_query = to_native_str(post_query)
+ post_query = unquote_plus(post_query)
+
+ elif mime.startswith('multipart/'):
+ env = {'REQUEST_METHOD': 'POST',
+ 'CONTENT_TYPE': mime,
+ 'CONTENT_LENGTH': len(post_query)}
+
+ args = dict(fp=BytesIO(post_query),
+ environ=env,
+ keep_blank_values=True)
+
+ if six.PY3:
+ args['encoding'] = 'utf-8'
+
+ data = cgi.FieldStorage(**args)
+
+ values = []
+ for item in data.list:
+ values.append((item.name, item.value))
+
+ post_query = urlencode(values, True)
+
+ elif mime.startswith('application/x-amf'):
+ post_query = amf_parse(post_query, environ)
+
+ else:
+ post_query = base64.b64encode(post_query)
+ post_query = to_native_str(post_query)
+ post_query = '&__wb_post_data=' + post_query
+
return post_query
+#=================================================================
+def amf_parse(string, environ):
+ try:
+ from pyamf import remoting
+
+ res = remoting.decode(BytesIO(string))
+
+ #print(res)
+ body = res.bodies[0][1].body[0]
+
+ values = {}
+
+ if hasattr(body, 'body'):
+ values['body'] = body.body
+
+ if hasattr(body, 'source'):
+ values['source'] = body.source
+
+ if hasattr(body, 'operation'):
+ values['op'] = body.operation
+
+ if environ is not None:
+ environ['pywb.inputdata'] = res
+
+ query = urlencode(values)
+ #print(query)
+ return query
+
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ print(e)
+ return None
+
+
#=================================================================
def append_post_query(url, post_query):
if not post_query:
@@ -167,23 +236,34 @@ def read_last_line(fh, offset=256):
#=================================================================
-class BlockLoader(object):
+class BaseLoader(object):
+ def __init__(self, **kwargs):
+ pass
+
+ def load(self, url, offset=0, length=-1):
+ raise NotImplemented()
+
+
+#=================================================================
+class BlockLoader(BaseLoader):
"""
a loader which can stream blocks of content
given a uri, offset and optional length.
Currently supports: http/https and file/local file system
"""
- def __init__(self, *args, **kwargs):
+ loaders = {}
+ profile_loader = None
+
+ def __init__(self, **kwargs):
self.cached = {}
- self.args = args
self.kwargs = kwargs
def load(self, url, offset=0, length=-1):
- loader = self._get_loader_for(url)
+ loader, url = self._get_loader_for_url(url)
return loader.load(url, offset, length)
- def _get_loader_for(self, url):
+ def _get_loader_for_url(self, url):
"""
Determine loading method based on uri
"""
@@ -193,18 +273,47 @@ class BlockLoader(object):
else:
type_ = parts[0]
+ if '+' in type_:
+ profile_name, scheme = type_.split('+', 1)
+ if len(parts) == 2:
+ url = scheme + '://' + parts[1]
+ else:
+ profile_name = ''
+ scheme = type_
+
loader = self.cached.get(type_)
if loader:
- return loader
+ return loader, url
+
+ loader_cls = self._get_loader_class_for_type(scheme)
- loader_cls = LOADERS.get(type_)
if not loader_cls:
- raise IOError('No Loader for type: ' + type_)
+ raise IOError('No Loader for type: ' + scheme)
+
+ profile = self.kwargs
+
+ if self.profile_loader:
+ profile = self.profile_loader(profile_name, scheme)
+
+ loader = loader_cls(**profile)
- loader = loader_cls(*self.args, **self.kwargs)
self.cached[type_] = loader
- return loader
+ return loader, url
+ def _get_loader_class_for_type(self, type_):
+ loader_cls = self.loaders.get(type_)
+ return loader_cls
+
+ @staticmethod
+ def init_default_loaders():
+ BlockLoader.loaders['http'] = HttpLoader
+ BlockLoader.loaders['https'] = HttpLoader
+ BlockLoader.loaders['s3'] = S3Loader
+ BlockLoader.loaders['file'] = LocalFileLoader
+
+ @staticmethod
+ def set_profile_loader(src):
+ BlockLoader.profile_loader = src
@staticmethod
def _make_range_header(offset, length):
@@ -217,10 +326,7 @@ class BlockLoader(object):
#=================================================================
-class LocalFileLoader(object):
- def __init__(self, *args, **kwargs):
- pass
-
+class LocalFileLoader(BaseLoader):
def load(self, url, offset=0, length=-1):
"""
Load a file-like reader from the local file system
@@ -260,9 +366,11 @@ class LocalFileLoader(object):
#=================================================================
-class HttpLoader(object):
- def __init__(self, cookie_maker=None, *args, **kwargs):
- self.cookie_maker = cookie_maker
+class HttpLoader(BaseLoader):
+ def __init__(self, **kwargs):
+ self.cookie_maker = kwargs.get('cookie_maker')
+ if not self.cookie_maker:
+ self.cookie_maker = kwargs.get('cookie')
self.session = None
def load(self, url, offset, length):
@@ -288,33 +396,47 @@ class HttpLoader(object):
#=================================================================
-class S3Loader(object):
- def __init__(self, *args, **kwargs):
+class S3Loader(BaseLoader):
+ def __init__(self, **kwargs):
self.s3conn = None
+ self.aws_access_key_id = kwargs.get('aws_access_key_id')
+ self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
def load(self, url, offset, length):
if not s3_avail: #pragma: no cover
raise IOError('To load from s3 paths, ' +
'you must install boto: pip install boto')
- if not self.s3conn:
- try:
- self.s3conn = connect_s3()
- except Exception: #pragma: no cover
- self.s3conn = connect_s3(anon=True)
+ aws_access_key_id = self.aws_access_key_id
+ aws_secret_access_key = self.aws_secret_access_key
parts = urlsplit(url)
- bucket = self.s3conn.get_bucket(parts.netloc)
+ if parts.username and parts.password:
+ aws_access_key_id = unquote_plus(parts.username)
+ aws_secret_access_key = unquote_plus(parts.password)
+ bucket_name = parts.netloc.split('@', 1)[-1]
+ else:
+ bucket_name = parts.netloc
- headers = {'Range': BlockLoader._make_range_header(offset, length)}
+ if not self.s3conn:
+ try:
+ self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key)
+ except Exception: #pragma: no cover
+ self.s3conn = connect_s3(anon=True)
+
+ bucket = self.s3conn.get_bucket(bucket_name)
key = bucket.get_key(parts.path)
- result = key.get_contents_as_string(headers=headers)
- key.close()
+ if offset == 0 and length == -1:
+ headers = {}
+ else:
+ headers = {'Range': BlockLoader._make_range_header(offset, length)}
- return BytesIO(result)
+ # Read range
+ key.open_read(headers=headers)
+ return key
#=================================================================
@@ -414,12 +536,6 @@ class LimitReader(object):
return stream
-
-#=================================================================
-LOADERS = {'http': HttpLoader,
- 'https': HttpLoader,
- 's3': S3Loader,
- 'file': LocalFileLoader
- }
-
+# ============================================================================
+BlockLoader.init_default_loaders()
diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py
index 9f4fd54a..7d058dcd 100644
--- a/pywb/utils/test/test_bufferedreaders.py
+++ b/pywb/utils/test/test_bufferedreaders.py
@@ -133,6 +133,14 @@ def compress_alt(buff):
return compressed
+# Brotli
+
+def test_brotli():
+ with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
+ x = DecompressingBufferedReader(fh, decomp_type='br')
+ x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
+
+
# Errors
@@ -140,9 +148,11 @@ def test_err_compress_mix():
# error: compressed member, followed by not compressed -- considered invalid
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
b = x.read()
- b = x.read_next_member()
- with pytest.raises(zlib.error):
- x.read()
+ assert b == b'ABC'
+ x.read_next_member()
+ assert x.read() == b''
+ #with pytest.raises(zlib.error):
+ # x.read()
#error: Error -3 while decompressing: incorrect header check
def test_err_chunk_cut_off():
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index abf0acfa..5d71a711 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -37,17 +37,21 @@ Traceback (most recent call last):
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
# HMAC Cookie Maker
->>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
+>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
'Example Domain'
# fixed cookie, range request
->>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read())
+>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read())
'Example Domain'
# range request
>>> print_str(BlockLoader().load('http://example.com', 1262).read())
'