diff --git a/README.rst b/README.rst index fc257400..010a6f3e 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.30.1 +PyWb 0.31.0 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index c3b4b701..9f66d658 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.30.1' +__version__ = '0.31.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 432d69e4..3e8dddc5 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -153,7 +153,7 @@ class CDXObject(OrderedDict): raise CDXException(msg) for header, field in zip(cdxformat, fields): - self[header] = field.decode('utf-8') + self[header] = to_native_str(field, 'utf-8') self.cdxline = cdxline @@ -213,7 +213,7 @@ class CDXObject(OrderedDict): def __str__(self): if self.cdxline: - return self.cdxline.decode('utf-8') + return to_native_str(self.cdxline, 'utf-8') if not self._from_json: return ' '.join(str(val) for val in six.itervalues(self)) @@ -263,7 +263,7 @@ class IDXObject(OrderedDict): raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) for header, field in zip(self.FORMAT, fields): - self[header] = field.decode('utf-8') + self[header] = to_native_str(field, 'utf-8') self['offset'] = int(self['offset']) self['length'] = int(self['length']) @@ -285,4 +285,4 @@ class IDXObject(OrderedDict): return json_encode(self) + '\n' def __str__(self): - return self.idxline.decode('utf-8') + return to_native_str(self.idxline, 'utf-8') diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 0d2634f5..36afff40 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -184,14 +184,15 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE', '').split(';')[0] + mime = self.env.get('CONTENT_TYPE', '') length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] buffered_stream = BytesIO() post_query = extract_post_query('POST', mime, length, stream, - buffered_stream=buffered_stream) + buffered_stream=buffered_stream, + environ=self.env) if post_query: self.env['wsgi.input'] = buffered_stream diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index e57f8591..90148c1f 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -120,7 +120,7 @@ class HTMLRewriterMixin(object): def _rewrite_meta_refresh(self, meta_refresh): if not meta_refresh: - return None + return '' m = self.META_REFRESH_REGEX.match(meta_refresh) if not m: @@ -133,6 +133,9 @@ class HTMLRewriterMixin(object): return meta_refresh def _rewrite_base(self, url, mod=''): + if not url: + return '' + url = self._ensure_url_has_path(url) base_url = self._rewrite_url(url, mod) @@ -183,11 +186,11 @@ class HTMLRewriterMixin(object): def _rewrite_url(self, value, mod=None): if not value: - return None + return '' value = value.strip() if not value: - return None + return '' value = self.try_unescape(value) return self.url_rewriter.rewrite(value, mod) @@ -209,21 +212,24 @@ class HTMLRewriterMixin(object): return new_value def _rewrite_srcset(self, value, mod=''): + if not value: + return '' + values = value.split(',') - values = map(lambda x: self._rewrite_url(x.strip()), values) + values = [self._rewrite_url(v.strip()) for v in values] return ', '.join(values) def _rewrite_css(self, css_content): if css_content: return self.css_rewriter.rewrite(css_content) else: - return None + return '' def _rewrite_script(self, script_content): if script_content: return self.js_rewriter.rewrite(script_content) else: - return None + return '' def has_attr(self, tag_attrs, attr): name, value = attr @@ -252,6 +258,11 @@ class HTMLRewriterMixin(object): self.out.write('<' + tag) for attr_name, attr_value in tag_attrs: + empty_attr = False + if attr_value is None: + attr_value = '' + empty_attr = True + # special case: inline JS/event handler if ((attr_value and attr_value.startswith('javascript:')) or attr_name.startswith('on')): @@ -324,7 +335,7 @@ class HTMLRewriterMixin(object): attr_value = self._rewrite_url(attr_value, rw_mod) # write the attr! - self._write_attr(attr_name, attr_value) + self._write_attr(attr_name, attr_value, empty_attr) return True @@ -347,11 +358,17 @@ class HTMLRewriterMixin(object): return True - def _write_attr(self, name, value): - # parser doesn't differentiate between 'attr=""' and just 'attr' - # 'attr=""' is more common, so use that form - if value: + def _write_attr(self, name, value, empty_attr): + # if empty_attr is set, just write 'attr'! + if empty_attr: + self.out.write(' ' + name) + + # write with value, if set + elif value: + self.out.write(' ' + name + '="' + value.replace('"', '"') + '"') + + # otherwise, 'attr=""' is more common, so use that form else: self.out.write(' ' + name + '=""') @@ -421,8 +438,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def feed(self, string): try: HTMLParser.feed(self, string) - except Exception: # pragma: no cover - # only raised in 2.6 + except Exception as e: # pragma: no cover + import traceback + traceback.print_exc() self.out.write(string) def _internal_close(self): diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py new file mode 100644 index 00000000..07a73470 --- /dev/null +++ b/pywb/rewrite/rewrite_amf.py @@ -0,0 +1,52 @@ +from io import BytesIO +from six.moves import zip +from pywb.rewrite.rewrite_content import RewriteContent + + +# ============================================================================ +# Expiermental: not fully tested +class RewriteContentAMF(RewriteContent): #pragma: no cover + def handle_custom_rewrite(self, text_type, status_headers, stream, env): + + if status_headers.get_header('Content-Type') == 'application/x-amf': + stream = self.rewrite_amf(stream, env) + + return (super(RewriteContentAMF, self). + handle_custom_rewrite(text_type, status_headers, stream, env)) + + def rewrite_amf(self, stream, env): + try: + from pyamf import remoting + + iobuff = BytesIO() + while True: + buff = stream.read() + if not buff: + break + iobuff.write(buff) + + iobuff.seek(0) + res = remoting.decode(iobuff) + + if env and env.get('pywb.inputdata'): + inputdata = env.get('pywb.inputdata') + + new_list = [] + + for src, target in zip(inputdata.bodies, res.bodies): + #print(target[0] + ' = ' + src[0]) + + #print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId) + target[1].body.correlationId = src[1].body[0].messageId + + new_list.append((src[0], target[1])) + + res.bodies = new_list + + return BytesIO(remoting.encode(res).getvalue()) + + except Exception as e: + import traceback + traceback.print_exc() + print(e) + return stream diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 4454ea3c..677e20ae 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -4,7 +4,7 @@ import webencodings import yaml import re -from chardet.universaldetector import UniversalDetector +#from chardet.universaldetector import UniversalDetector from io import BytesIO from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders @@ -21,7 +21,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter #================================================================= -class RewriteContent: +class RewriteContent(object): HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I) TAG_REGEX = re.compile(b'^\s*\<') @@ -77,6 +77,7 @@ class RewriteContent: def _check_encoding(self, rewritten_headers, stream, enc): + matched = False if (rewritten_headers. contains_removed_header('content-encoding', enc)): @@ -87,14 +88,15 @@ class RewriteContent: stream = DecompressingBufferedReader(stream, decomp_type=enc) rewritten_headers.status_headers.remove_header('content-length') + matched = True - return stream + return matched, stream def rewrite_content(self, urlrewriter, status_headers, stream, head_insert_func=None, urlkey='', - cdx=None, cookie_rewriter=None): + cdx=None, cookie_rewriter=None, env=None): wb_url = urlrewriter.wburl @@ -118,9 +120,12 @@ class RewriteContent: status_headers = rewritten_headers.status_headers - # use rewritten headers, but no further rewriting needed - if rewritten_headers.text_type is None: - return (status_headers, self.stream_to_gen(stream), False) + res = self.handle_custom_rewrite(rewritten_headers.text_type, + status_headers, + stream, + env) + if res: + return res # Handle text content rewriting # ==================================================================== @@ -136,8 +141,12 @@ class RewriteContent: encoding = None first_buff = b'' - stream = self._check_encoding(rewritten_headers, stream, 'gzip') - stream = self._check_encoding(rewritten_headers, stream, 'deflate') + for decomp_type in BufferedReader.get_supported_decompressors(): + matched, stream = self._check_encoding(rewritten_headers, + stream, + decomp_type) + if matched: + break if mod == 'js_': text_type, stream = self._resolve_text_type('js', @@ -237,6 +246,11 @@ class RewriteContent: return (status_headers, gen, True) + def handle_custom_rewrite(self, text_type, status_headers, stream, env): + # use rewritten headers, but no further rewriting needed + if text_type is None: + return (status_headers, self.stream_to_gen(stream), False) + @staticmethod def _extract_html_charset(buff, status_headers): charset = None @@ -360,3 +374,5 @@ class RewriteContent: finally: stream.close() + + diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 818bd114..afb1da93 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -49,6 +49,12 @@ r""" >>> parse('', urlrewriter=no_base_canon_rewriter) +# Empty url +>>> parse('') + + +>>> parse('') + # HTML Entities @@ -66,6 +72,10 @@ r""" >>> parse('X') X +# Empty values should be ignored +>>> parse('') + + # SKIPPED # Unicode -- default with %-encoding #>>> parse(u'испытание') @@ -92,7 +102,7 @@ r""" >>> parse('') - + >>> parse('') @@ -115,6 +125,10 @@ r""" >>> parse('') +# empty srcset attrib +>>> parse('') + + # Script tag >>> parse('') @@ -131,7 +145,7 @@ r""" >>> parse('
') -
+
>>> parse('') diff --git a/pywb/static/wb.js b/pywb/static/wb.js index e186f1df..f57e833b 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -123,12 +123,23 @@ function notify_top() { return; } - if (window.__WB_top_frame.update_wb_url) { - window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, - wbinfo.timestamp, - wbinfo.request_ts, - wbinfo.is_live); - } + //if (window.__WB_top_frame.update_wb_url) { + // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, + // wbinfo.timestamp, + // wbinfo.request_ts, + // wbinfo.is_live); + //} + + var message = { + "url": window.WB_wombat_location.href, + "ts": wbinfo.timestamp, + "request_ts": wbinfo.request_ts, + "is_live": wbinfo.is_live, + "title": "", + "wb_type": "load", + } + + window.__WB_top_frame.postMessage(message, "*"); remove_event("readystatechange", notify_top, document); } diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index c9e47ef3..168b914f 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -38,27 +38,21 @@ function make_url(url, ts, mod) } } -function push_state(url, timestamp, request_ts, capture_str, is_live) { +function push_state(state) { var frame = document.getElementById(IFRAME_ID).contentWindow; if (frame.WB_wombat_location) { var curr_href = frame.WB_wombat_location.href; // If not current url, don't update - if (url != curr_href) { + if (state.url != curr_href) { return; } } - var state = {} - state.timestamp = timestamp; - state.request_ts = request_ts; - state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod); - state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod); - state.url = url; - state.capture_str = capture_str; - state.is_live = is_live; + state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod); + state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod); - var canon_url = make_url(url, state.request_ts, ""); + var canon_url = make_url(state.url, state.request_ts, ""); if (window.location.href != canon_url) { window.history.replaceState(state, "", canon_url); } @@ -157,7 +151,13 @@ function iframe_loaded(event) { request_ts = ts; } - update_wb_url(url, ts, request_ts, is_live); + var state = {} + state["url"] = url; + state["ts"] = ts; + state["request_ts"] = request_ts; + state["is_live"] = is_live + + update_wb_url(state); } @@ -165,12 +165,18 @@ function init_pm() { var frame = document.getElementById(IFRAME_ID).contentWindow; window.addEventListener("message", function(event) { - // Pass to replay frame if (event.source == window.parent) { + // Pass to replay frame frame.postMessage(event.data, "*"); } else if (event.source == frame) { - // Pass to parent - window.parent.postMessage(event.data, "*"); + + // Check if iframe url change message + if (typeof(event.data) == "object" && event.data["wb_type"]) { + update_wb_url(event.data); + } else { + // Pass to parent + window.parent.postMessage(event.data, "*"); + } } }); @@ -181,14 +187,14 @@ function init_pm() { } -function update_wb_url(url, ts, request_ts, is_live) { - if (curr_state.url == url && curr_state.timestamp == ts) { +function update_wb_url(state) { + if (curr_state.url == state.url && curr_state.ts == state.ts) { return; } - capture_str = _wb_js.ts_to_date(ts, true); + state['capture_str'] = _wb_js.ts_to_date(state.ts, true); - push_state(url, ts, request_ts, capture_str, is_live); + push_state(state); } // Load Banner @@ -237,3 +243,4 @@ function init_hash_connect() { } document.addEventListener("DOMContentLoaded", init_hash_connect); + diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 5fb4e2a4..af280f94 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) { var parser = make_parser(extract_orig($wbwindow.document.baseURI)); var href = parser.href; var hash = href.lastIndexOf("#"); + if (hash >= 0) { href = href.substring(0, hash); } @@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) { if (lastslash >= 0 && lastslash != (href.length - 1)) { href = href.substring(0, lastslash + 1); - } else { - href += "/"; } parser.href = href + url; @@ -667,15 +666,15 @@ var wombat_internal = function($wbwindow) { // Adapted from: // http://indiegamr.com/generate-repeatable-random-numbers-in-js/ - Math.seed = parseInt(seed); + $wbwindow.Math.seed = parseInt(seed); function seeded_random() { - Math.seed = (Math.seed * 9301 + 49297) % 233280; - var rnd = Math.seed / 233280; + $wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280; + var rnd = $wbwindow.Math.seed / 233280; return rnd; } - Math.random = seeded_random; + $wbwindow.Math.random = seeded_random; } function init_crypto_random() { @@ -687,7 +686,7 @@ var wombat_internal = function($wbwindow) { var new_getrandom = function(array) { for (i = 0; i < array.length; i++) { - array[i] = parseInt(Math.random() * 4294967296); + array[i] = parseInt($wbwindow.Math.random() * 4294967296); } return array; } @@ -719,11 +718,23 @@ var wombat_internal = function($wbwindow) { orig_func.call(this, state_obj, title, url); - if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { - $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, - wb_info.timestamp, - wb_info.request_ts, - wb_info.is_live); + //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { + // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, + // wb_info.timestamp, + // wb_info.request_ts, + // wb_info.is_live); + //} + if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) { + var message = { + "url": url, + "ts": wb_info.timestamp, + "request_ts": wb_info.request_ts, + "is_live": wb_info.is_live, + "title": title, + "wb_type": func_name, + } + + $wbwindow.__WB_top_frame.postMessage(message, "*"); } } @@ -931,7 +942,8 @@ var wombat_internal = function($wbwindow) { //var timezone = new Date().getTimezoneOffset() * 60 * 1000; // Already UTC! var timezone = 0; - var timediff = $wbwindow.Date.now() - (timestamp - timezone); + var start_now = $wbwindow.Date.now() + var timediff = start_now - (timestamp - timezone); if ($wbwindow.__wb_Date_now) { return; @@ -1656,13 +1668,14 @@ var wombat_internal = function($wbwindow) { var from = source.WB_wombat_location.origin; - if (!source.__WB_id) { - source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href; - } if (!this.__WB_win_id) { this.__WB_win_id = {}; + this.__WB_counter = 0; } + if (!source.__WB_id) { + source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href; + } this.__WB_win_id[source.__WB_id] = source; src_id = source.__WB_id; @@ -1783,19 +1796,22 @@ var wombat_internal = function($wbwindow) { //============================================ function init_open_override() { - if (!$wbwindow.Window.prototype.open) { - return; + var orig = $wbwindow.open; + + if ($wbwindow.Window.prototype.open) { + orig = $wbwindow.Window.prototype.open; } - var orig = $wbwindow.Window.prototype.open; - var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { - strUrl = rewrite_url(strUrl); + strUrl = rewrite_url(strUrl, false, ""); return orig.call(this, strUrl, strWindowName, strWindowFeatures); } $wbwindow.open = open_rewritten; - $wbwindow.Window.prototype.open = open_rewritten; + + if ($wbwindow.Window.prototype.open) { + $wbwindow.Window.prototype.open = open_rewritten; + } for (var i = 0; i < $wbwindow.frames.length; i++) { try { @@ -2086,7 +2102,7 @@ var wombat_internal = function($wbwindow) { //============================================ function get_final_url(prefix, mod, url) { - if (!mod) { + if (mod == undefined) { mod = wb_info.mod; } diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index f3268c58..e1ebfc90 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -1,5 +1,6 @@ from io import BytesIO import zlib +import brotli #================================================================= @@ -17,6 +18,11 @@ def deflate_decompressor(): def deflate_decompressor_alt(): return zlib.decompressobj(-zlib.MAX_WBITS) +def brotli_decompressor(): + decomp = brotli.Decompressor() + decomp.unused_data = None + return decomp + #================================================================= class BufferedReader(object): @@ -40,7 +46,9 @@ class BufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor, 'deflate': deflate_decompressor, - 'deflate_alt': deflate_decompressor_alt} + 'deflate_alt': deflate_decompressor_alt, + 'br': brotli_decompressor + } def __init__(self, stream, block_size=1024, decomp_type=None, @@ -98,7 +106,7 @@ class BufferedReader(object): if self.decompressor and data: try: data = self.decompressor.decompress(data) - except Exception: + except Exception as e: # if first read attempt, assume non-gzipped stream if self.num_read == 0: if self.decomp_type == 'deflate': @@ -108,7 +116,8 @@ class BufferedReader(object): self.decompressor = None # otherwise (partly decompressed), something is wrong else: - raise + print(str(e)) + return b'' return data def read(self, length=None): @@ -180,6 +189,10 @@ class BufferedReader(object): self.stream.close() self.stream = None + @classmethod + def get_supported_decompressors(cls): + return cls.DECOMPRESSORS.keys() + #================================================================= class DecompressingBufferedReader(BufferedReader): diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6dbbf1e2..4c298334 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,10 +9,12 @@ import requests import six from six.moves.urllib.request import pathname2url, url2pathname -from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit +from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode import time import pkg_resources +import base64 +import cgi from io import open, BytesIO @@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x): #================================================================= -def extract_post_query(method, mime, length, stream, buffered_stream=None): +def extract_post_query(method, mime, length, stream, + buffered_stream=None, + environ=None): """ Extract a url-encoded form POST from stream - If not a application/x-www-form-urlencoded, or no missing content length, return None + Attempt to decode application/x-www-form-urlencoded or multipart/*, + otherwise read whole block and b64encode """ if method.upper() != 'POST': return None - if ((not mime or - not mime.lower().startswith('application/x-www-form-urlencoded'))): - return None - try: length = int(length) except (ValueError, TypeError): @@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None): buffered_stream.write(post_query) buffered_stream.seek(0) - post_query = to_native_str(post_query) - post_query = unquote_plus(post_query) + if not mime: + mime = '' + + if mime.startswith('application/x-www-form-urlencoded'): + post_query = to_native_str(post_query) + post_query = unquote_plus(post_query) + + elif mime.startswith('multipart/'): + env = {'REQUEST_METHOD': 'POST', + 'CONTENT_TYPE': mime, + 'CONTENT_LENGTH': len(post_query)} + + args = dict(fp=BytesIO(post_query), + environ=env, + keep_blank_values=True) + + if six.PY3: + args['encoding'] = 'utf-8' + + data = cgi.FieldStorage(**args) + + values = [] + for item in data.list: + values.append((item.name, item.value)) + + post_query = urlencode(values, True) + + elif mime.startswith('application/x-amf'): + post_query = amf_parse(post_query, environ) + + else: + post_query = base64.b64encode(post_query) + post_query = to_native_str(post_query) + post_query = '&__wb_post_data=' + post_query + return post_query +#================================================================= +def amf_parse(string, environ): + try: + from pyamf import remoting + + res = remoting.decode(BytesIO(string)) + + #print(res) + body = res.bodies[0][1].body[0] + + values = {} + + if hasattr(body, 'body'): + values['body'] = body.body + + if hasattr(body, 'source'): + values['source'] = body.source + + if hasattr(body, 'operation'): + values['op'] = body.operation + + if environ is not None: + environ['pywb.inputdata'] = res + + query = urlencode(values) + #print(query) + return query + + except Exception as e: + import traceback + traceback.print_exc() + print(e) + return None + + #================================================================= def append_post_query(url, post_query): if not post_query: @@ -167,23 +236,34 @@ def read_last_line(fh, offset=256): #================================================================= -class BlockLoader(object): +class BaseLoader(object): + def __init__(self, **kwargs): + pass + + def load(self, url, offset=0, length=-1): + raise NotImplemented() + + +#================================================================= +class BlockLoader(BaseLoader): """ a loader which can stream blocks of content given a uri, offset and optional length. Currently supports: http/https and file/local file system """ - def __init__(self, *args, **kwargs): + loaders = {} + profile_loader = None + + def __init__(self, **kwargs): self.cached = {} - self.args = args self.kwargs = kwargs def load(self, url, offset=0, length=-1): - loader = self._get_loader_for(url) + loader, url = self._get_loader_for_url(url) return loader.load(url, offset, length) - def _get_loader_for(self, url): + def _get_loader_for_url(self, url): """ Determine loading method based on uri """ @@ -193,18 +273,47 @@ class BlockLoader(object): else: type_ = parts[0] + if '+' in type_: + profile_name, scheme = type_.split('+', 1) + if len(parts) == 2: + url = scheme + '://' + parts[1] + else: + profile_name = '' + scheme = type_ + loader = self.cached.get(type_) if loader: - return loader + return loader, url + + loader_cls = self._get_loader_class_for_type(scheme) - loader_cls = LOADERS.get(type_) if not loader_cls: - raise IOError('No Loader for type: ' + type_) + raise IOError('No Loader for type: ' + scheme) + + profile = self.kwargs + + if self.profile_loader: + profile = self.profile_loader(profile_name, scheme) + + loader = loader_cls(**profile) - loader = loader_cls(*self.args, **self.kwargs) self.cached[type_] = loader - return loader + return loader, url + def _get_loader_class_for_type(self, type_): + loader_cls = self.loaders.get(type_) + return loader_cls + + @staticmethod + def init_default_loaders(): + BlockLoader.loaders['http'] = HttpLoader + BlockLoader.loaders['https'] = HttpLoader + BlockLoader.loaders['s3'] = S3Loader + BlockLoader.loaders['file'] = LocalFileLoader + + @staticmethod + def set_profile_loader(src): + BlockLoader.profile_loader = src @staticmethod def _make_range_header(offset, length): @@ -217,10 +326,7 @@ class BlockLoader(object): #================================================================= -class LocalFileLoader(object): - def __init__(self, *args, **kwargs): - pass - +class LocalFileLoader(BaseLoader): def load(self, url, offset=0, length=-1): """ Load a file-like reader from the local file system @@ -260,9 +366,11 @@ class LocalFileLoader(object): #================================================================= -class HttpLoader(object): - def __init__(self, cookie_maker=None, *args, **kwargs): - self.cookie_maker = cookie_maker +class HttpLoader(BaseLoader): + def __init__(self, **kwargs): + self.cookie_maker = kwargs.get('cookie_maker') + if not self.cookie_maker: + self.cookie_maker = kwargs.get('cookie') self.session = None def load(self, url, offset, length): @@ -288,33 +396,47 @@ class HttpLoader(object): #================================================================= -class S3Loader(object): - def __init__(self, *args, **kwargs): +class S3Loader(BaseLoader): + def __init__(self, **kwargs): self.s3conn = None + self.aws_access_key_id = kwargs.get('aws_access_key_id') + self.aws_secret_access_key = kwargs.get('aws_secret_access_key') def load(self, url, offset, length): if not s3_avail: #pragma: no cover raise IOError('To load from s3 paths, ' + 'you must install boto: pip install boto') - if not self.s3conn: - try: - self.s3conn = connect_s3() - except Exception: #pragma: no cover - self.s3conn = connect_s3(anon=True) + aws_access_key_id = self.aws_access_key_id + aws_secret_access_key = self.aws_secret_access_key parts = urlsplit(url) - bucket = self.s3conn.get_bucket(parts.netloc) + if parts.username and parts.password: + aws_access_key_id = unquote_plus(parts.username) + aws_secret_access_key = unquote_plus(parts.password) + bucket_name = parts.netloc.split('@', 1)[-1] + else: + bucket_name = parts.netloc - headers = {'Range': BlockLoader._make_range_header(offset, length)} + if not self.s3conn: + try: + self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key) + except Exception: #pragma: no cover + self.s3conn = connect_s3(anon=True) + + bucket = self.s3conn.get_bucket(bucket_name) key = bucket.get_key(parts.path) - result = key.get_contents_as_string(headers=headers) - key.close() + if offset == 0 and length == -1: + headers = {} + else: + headers = {'Range': BlockLoader._make_range_header(offset, length)} - return BytesIO(result) + # Read range + key.open_read(headers=headers) + return key #================================================================= @@ -414,12 +536,6 @@ class LimitReader(object): return stream - -#================================================================= -LOADERS = {'http': HttpLoader, - 'https': HttpLoader, - 's3': S3Loader, - 'file': LocalFileLoader - } - +# ============================================================================ +BlockLoader.init_default_loaders() diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 9f4fd54a..7d058dcd 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -133,6 +133,14 @@ def compress_alt(buff): return compressed +# Brotli + +def test_brotli(): + with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh: + x = DecompressingBufferedReader(fh, decomp_type='br') + x.read() == b'The quick brown fox jumps over the lazy dog' * 4096 + + # Errors @@ -140,9 +148,11 @@ def test_err_compress_mix(): # error: compressed member, followed by not compressed -- considered invalid x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip') b = x.read() - b = x.read_next_member() - with pytest.raises(zlib.error): - x.read() + assert b == b'ABC' + x.read_next_member() + assert x.read() == b'' + #with pytest.raises(zlib.error): + # x.read() #error: Error -3 while decompressing: incorrect header check def test_err_chunk_cut_off(): diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index abf0acfa..5d71a711 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -37,17 +37,21 @@ Traceback (most recent call last): IOError: [Errno 2] No such file or directory: '_x_no_such_file_' # HMAC Cookie Maker ->>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) +>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) 'Example Domain' # fixed cookie, range request ->>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read()) +>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read()) 'Example Domain' # range request >>> print_str(BlockLoader().load('http://example.com', 1262).read()) '\n' +# custom profile +>>> print_str(BlockLoader().load('local+http://example.com', 1262).read()) +'\n' + # unknown loader error #>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL #Traceback (most recent call last): @@ -90,8 +94,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_' # unsupported method >>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data)) -# unsupported type +# base64 encode >>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data)) +'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' # invalid length >>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data)) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 76a76abd..4ff500d4 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object): self['mime'] = def_mime if mime: self['mime'] = self.MIME_RE.split(mime, 1)[0] + self['_content_type'] = mime def extract_status(self, status_headers): """ Extract status code only from status line @@ -390,7 +391,7 @@ class DefaultRecordParser(object): len_ = record.status_headers.get_header('Content-Length') post_query = extract_post_query(method, - entry.get('mime'), + entry.get('_content_type'), len_, record.stream) diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 06a3c79e..402d1524 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object): def __init__(self, loader=None, cookie_maker=None, block_size=8192, verify_http=True): if not loader: - loader = BlockLoader(cookie_maker) + loader = BlockLoader(cookie_maker=cookie_maker) self.loader = loader self.block_size = block_size diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index d3771c68..7d95db1c 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -174,7 +174,8 @@ class ReplayView(object): stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - cdx=cdx)) + cdx=cdx, + env=wbrequest.env)) (status_headers, response_iter, is_rewritten) = result diff --git a/sample_archive/text_content/quickfox_repeated.compressed b/sample_archive/text_content/quickfox_repeated.compressed new file mode 100644 index 00000000..f9d79767 --- /dev/null +++ b/sample_archive/text_content/quickfox_repeated.compressed @@ -0,0 +1,2 @@ +["y\ZB;%UZ5 +{K< @Mme'_0{<S \ No newline at end of file diff --git a/setup.py b/setup.py index 6dba8420..629ea228 100755 --- a/setup.py +++ b/setup.py @@ -78,7 +78,8 @@ setup( 'requests', 'redis', 'jinja2', - 'surt==0.3b4', + 'surt>=0.3.0', + 'brotlipy', 'pyyaml', 'watchdog', 'webencodings', @@ -90,9 +91,6 @@ setup( 'fakeredis', 'mock', ], - dependency_links=[ - 'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops', - ], cmdclass={'test': PyTest}, test_suite='', entry_points="""