mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop'

Ilya Kreymer 2016-06-16 00:41:08 -04:00
commit f4e5a7df5d
20 changed files with 423 additions and 142 deletions

View File

@ -1,4 +1,4 @@
PyWb 0.30.1
PyWb 0.31.0
===========
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master

View File

@ -1,4 +1,4 @@
__version__ = '0.30.1'
__version__ = '0.31.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'

View File

@ -153,7 +153,7 @@ class CDXObject(OrderedDict):
raise CDXException(msg)
for header, field in zip(cdxformat, fields):
self[header] = field.decode('utf-8')
self[header] = to_native_str(field, 'utf-8')
self.cdxline = cdxline
@ -213,7 +213,7 @@ class CDXObject(OrderedDict):
def __str__(self):
if self.cdxline:
return self.cdxline.decode('utf-8')
return to_native_str(self.cdxline, 'utf-8')
if not self._from_json:
return ' '.join(str(val) for val in six.itervalues(self))
@ -263,7 +263,7 @@ class IDXObject(OrderedDict):
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in zip(self.FORMAT, fields):
self[header] = field.decode('utf-8')
self[header] = to_native_str(field, 'utf-8')
self['offset'] = int(self['offset'])
self['length'] = int(self['length'])
@ -285,4 +285,4 @@ class IDXObject(OrderedDict):
return json_encode(self) + '\n'
def __str__(self):
return self.idxline.decode('utf-8')
return to_native_str(self.idxline, 'utf-8')
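
These hunks replace direct .decode() calls with to_native_str(), which yields the native str type on both Python 2 and Python 3 rather than always producing text. A minimal sketch of the helper, matching the signature visible in the pywb/utils/loaders.py hunks further down:

    import six

    def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
        # already the native str type (bytes on py2, text on py3)
        if isinstance(value, str):
            return value
        # py3: decode bytes to str; py2: encode unicode to str
        if six.PY3 and isinstance(value, six.binary_type):
            return func(value.decode(encoding))
        elif six.PY2 and isinstance(value, six.text_type):
            return func(value.encode(encoding))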

View File

@ -184,14 +184,15 @@ class WbRequest(object):
if not self.wb_url:
return
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
mime = self.env.get('CONTENT_TYPE', '')
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
buffered_stream = BytesIO()
post_query = extract_post_query('POST', mime, length, stream,
buffered_stream=buffered_stream)
buffered_stream=buffered_stream,
environ=self.env)
if post_query:
self.env['wsgi.input'] = buffered_stream
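
The full Content-Type (parameters included) is now passed through so multipart boundaries survive, and the consumed body is re-buffered so downstream WSGI code can still read it. The buffer-and-restore pattern, as a standalone sketch (function name is illustrative):

    from io import BytesIO

    def replayable_body(env):
        # read the POST body once, then restore env['wsgi.input'] so
        # later consumers can read it again (extract_post_query's
        # buffered_stream does the equivalent)
        length = int(env.get('CONTENT_LENGTH') or 0)
        body = env['wsgi.input'].read(length)
        env['wsgi.input'] = BytesIO(body)
        return body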

View File

@ -120,7 +120,7 @@ class HTMLRewriterMixin(object):
def _rewrite_meta_refresh(self, meta_refresh):
if not meta_refresh:
return None
return ''
m = self.META_REFRESH_REGEX.match(meta_refresh)
if not m:
@ -133,6 +133,9 @@ class HTMLRewriterMixin(object):
return meta_refresh
def _rewrite_base(self, url, mod=''):
if not url:
return ''
url = self._ensure_url_has_path(url)
base_url = self._rewrite_url(url, mod)
@ -183,11 +186,11 @@ class HTMLRewriterMixin(object):
def _rewrite_url(self, value, mod=None):
if not value:
return None
return ''
value = value.strip()
if not value:
return None
return ''
value = self.try_unescape(value)
return self.url_rewriter.rewrite(value, mod)
@ -209,21 +212,24 @@ class HTMLRewriterMixin(object):
return new_value
def _rewrite_srcset(self, value, mod=''):
if not value:
return ''
values = value.split(',')
values = map(lambda x: self._rewrite_url(x.strip()), values)
values = [self._rewrite_url(v.strip()) for v in values]
return ', '.join(values)
def _rewrite_css(self, css_content):
if css_content:
return self.css_rewriter.rewrite(css_content)
else:
return None
return ''
def _rewrite_script(self, script_content):
if script_content:
return self.js_rewriter.rewrite(script_content)
else:
return None
return ''
def has_attr(self, tag_attrs, attr):
name, value = attr
@ -252,6 +258,11 @@ class HTMLRewriterMixin(object):
self.out.write('<' + tag)
for attr_name, attr_value in tag_attrs:
empty_attr = False
if attr_value is None:
attr_value = ''
empty_attr = True
# special case: inline JS/event handler
if ((attr_value and attr_value.startswith('javascript:'))
or attr_name.startswith('on')):
@ -324,7 +335,7 @@ class HTMLRewriterMixin(object):
attr_value = self._rewrite_url(attr_value, rw_mod)
# write the attr!
self._write_attr(attr_name, attr_value)
self._write_attr(attr_name, attr_value, empty_attr)
return True
@ -347,11 +358,17 @@ class HTMLRewriterMixin(object):
return True
def _write_attr(self, name, value):
# parser doesn't differentiate between 'attr=""' and just 'attr'
# 'attr=""' is more common, so use that form
if value:
def _write_attr(self, name, value, empty_attr):
# if empty_attr is set, just write 'attr'!
if empty_attr:
self.out.write(' ' + name)
# write with value, if set
elif value:
self.out.write(' ' + name + '="' + value.replace('"', '&quot;') + '"')
# otherwise, 'attr=""' is more common, so use that form
else:
self.out.write(' ' + name + '=""')
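
The new empty_attr flag distinguishes a value-less attribute (which the parser reports as None) from an explicitly empty one, so all three HTML forms round-trip. The same three-way logic, restated as a standalone sketch:

    def write_attr(out, name, value, empty_attr):
        if empty_attr:
            out.write(' ' + name)                          # <input value>
        elif value:
            out.write(' ' + name + '="' +
                      value.replace('"', '&quot;') + '"')  # <input value="v">
        else:
            out.write(' ' + name + '=""')                  # <input value="">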
@ -421,8 +438,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def feed(self, string):
try:
HTMLParser.feed(self, string)
except Exception: # pragma: no cover
# only raised in 2.6
except Exception as e: # pragma: no cover
import traceback
traceback.print_exc()
self.out.write(string)
def _internal_close(self):

View File

@ -0,0 +1,52 @@
from io import BytesIO
from six.moves import zip
from pywb.rewrite.rewrite_content import RewriteContent
# ============================================================================
# Experimental: not fully tested
class RewriteContentAMF(RewriteContent): #pragma: no cover
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
if status_headers.get_header('Content-Type') == 'application/x-amf':
stream = self.rewrite_amf(stream, env)
return (super(RewriteContentAMF, self).
handle_custom_rewrite(text_type, status_headers, stream, env))
def rewrite_amf(self, stream, env):
try:
from pyamf import remoting
iobuff = BytesIO()
while True:
buff = stream.read()
if not buff:
break
iobuff.write(buff)
iobuff.seek(0)
res = remoting.decode(iobuff)
if env and env.get('pywb.inputdata'):
inputdata = env.get('pywb.inputdata')
new_list = []
for src, target in zip(inputdata.bodies, res.bodies):
#print(target[0] + ' = ' + src[0])
#print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId)
target[1].body.correlationId = src[1].body[0].messageId
new_list.append((src[0], target[1]))
res.bodies = new_list
return BytesIO(remoting.encode(res).getvalue())
except Exception as e:
import traceback
traceback.print_exc()
print(e)
return stream

View File

@ -4,7 +4,7 @@ import webencodings
import yaml
import re
from chardet.universaldetector import UniversalDetector
#from chardet.universaldetector import UniversalDetector
from io import BytesIO
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
@ -21,7 +21,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
#=================================================================
class RewriteContent:
class RewriteContent(object):
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
TAG_REGEX = re.compile(b'^\s*\<')
@ -77,6 +77,7 @@ class RewriteContent:
def _check_encoding(self, rewritten_headers, stream, enc):
matched = False
if (rewritten_headers.
contains_removed_header('content-encoding', enc)):
@ -87,14 +88,15 @@ class RewriteContent:
stream = DecompressingBufferedReader(stream, decomp_type=enc)
rewritten_headers.status_headers.remove_header('content-length')
matched = True
return stream
return matched, stream
def rewrite_content(self, urlrewriter, status_headers, stream,
head_insert_func=None, urlkey='',
cdx=None, cookie_rewriter=None):
cdx=None, cookie_rewriter=None, env=None):
wb_url = urlrewriter.wburl
@ -118,9 +120,12 @@ class RewriteContent:
status_headers = rewritten_headers.status_headers
# use rewritten headers, but no further rewriting needed
if rewritten_headers.text_type is None:
return (status_headers, self.stream_to_gen(stream), False)
res = self.handle_custom_rewrite(rewritten_headers.text_type,
status_headers,
stream,
env)
if res:
return res
# Handle text content rewriting
# ====================================================================
@ -136,8 +141,12 @@ class RewriteContent:
encoding = None
first_buff = b''
stream = self._check_encoding(rewritten_headers, stream, 'gzip')
stream = self._check_encoding(rewritten_headers, stream, 'deflate')
for decomp_type in BufferedReader.get_supported_decompressors():
matched, stream = self._check_encoding(rewritten_headers,
stream,
decomp_type)
if matched:
break
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',
@ -237,6 +246,11 @@ class RewriteContent:
return (status_headers, gen, True)
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
# use rewritten headers, but no further rewriting needed
if text_type is None:
return (status_headers, self.stream_to_gen(stream), False)
@staticmethod
def _extract_html_charset(buff, status_headers):
charset = None
@ -360,3 +374,5 @@ class RewriteContent:
finally:
stream.close()
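
handle_custom_rewrite is the new extension hook: the base implementation (in the hunk above) only short-circuits non-text responses, and subclasses such as RewriteContentAMF intercept a specific content type before deferring to super(). A minimal subclass sketch (the content type and transform step are hypothetical):

    from pywb.rewrite.rewrite_content import RewriteContent

    class MyRewrite(RewriteContent):
        def handle_custom_rewrite(self, text_type, status_headers, stream, env):
            # intercept one content type, pass everything else through
            if status_headers.get_header('Content-Type') == 'application/x-custom':
                stream = self.transform(stream)  # hypothetical transform
            return super(MyRewrite, self).handle_custom_rewrite(
                text_type, status_headers, stream, env)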

View File

@ -49,6 +49,12 @@ r"""
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
# Empty url
>>> parse('<base href="">')
<base href="">
>>> parse('<base href>')
<base href>
# HTML Entities
@ -66,6 +72,10 @@ r"""
>>> parse('<input value="&amp;X&amp;&quot;">X</input>')
<input value="&amp;X&amp;&quot;">X</input>
# Empty values should be ignored
>>> parse('<input name="foo" value>')
<input name="foo" value>
# SKIPPED
# Unicode -- default with %-encoding
#>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
@ -92,7 +102,7 @@ r"""
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
<meta http-equiv="refresh" content>
>>> parse('<meta property="og:image" content="http://example.com/example.jpg">')
<meta property="og:image" content="/web/20131226101010/http://example.com/example.jpg">
@ -115,6 +125,10 @@ r"""
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
<img srcset="/web/20131226101010///example.com/1x 1x, /web/20131226101010///example.com/foo 2x, /web/20131226101010/https://example.com/bar 4x">
# empty srcset attrib
>>> parse('<img srcset="">')
<img srcset="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
@ -131,7 +145,7 @@ r"""
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>

View File

@ -123,12 +123,23 @@ function notify_top() {
return;
}
if (window.__WB_top_frame.update_wb_url) {
window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
wbinfo.timestamp,
wbinfo.request_ts,
wbinfo.is_live);
}
//if (window.__WB_top_frame.update_wb_url) {
// window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
// wbinfo.timestamp,
// wbinfo.request_ts,
// wbinfo.is_live);
//}
var message = {
"url": window.WB_wombat_location.href,
"ts": wbinfo.timestamp,
"request_ts": wbinfo.request_ts,
"is_live": wbinfo.is_live,
"title": "",
"wb_type": "load",
}
window.__WB_top_frame.postMessage(message, "*");
remove_event("readystatechange", notify_top, document);
}

View File

@ -38,27 +38,21 @@ function make_url(url, ts, mod)
}
}
function push_state(url, timestamp, request_ts, capture_str, is_live) {
function push_state(state) {
var frame = document.getElementById(IFRAME_ID).contentWindow;
if (frame.WB_wombat_location) {
var curr_href = frame.WB_wombat_location.href;
// If not current url, don't update
if (url != curr_href) {
if (state.url != curr_href) {
return;
}
}
var state = {}
state.timestamp = timestamp;
state.request_ts = request_ts;
state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod);
state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod);
state.url = url;
state.capture_str = capture_str;
state.is_live = is_live;
state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
var canon_url = make_url(url, state.request_ts, "");
var canon_url = make_url(state.url, state.request_ts, "");
if (window.location.href != canon_url) {
window.history.replaceState(state, "", canon_url);
}
@ -157,7 +151,13 @@ function iframe_loaded(event) {
request_ts = ts;
}
update_wb_url(url, ts, request_ts, is_live);
var state = {}
state["url"] = url;
state["ts"] = ts;
state["request_ts"] = request_ts;
state["is_live"] = is_live
update_wb_url(state);
}
@ -165,12 +165,18 @@ function init_pm() {
var frame = document.getElementById(IFRAME_ID).contentWindow;
window.addEventListener("message", function(event) {
// Pass to replay frame
if (event.source == window.parent) {
// Pass to replay frame
frame.postMessage(event.data, "*");
} else if (event.source == frame) {
// Pass to parent
window.parent.postMessage(event.data, "*");
// Check if iframe url change message
if (typeof(event.data) == "object" && event.data["wb_type"]) {
update_wb_url(event.data);
} else {
// Pass to parent
window.parent.postMessage(event.data, "*");
}
}
});
@ -181,14 +187,14 @@ function init_pm() {
}
function update_wb_url(url, ts, request_ts, is_live) {
if (curr_state.url == url && curr_state.timestamp == ts) {
function update_wb_url(state) {
if (curr_state.url == state.url && curr_state.ts == state.ts) {
return;
}
capture_str = _wb_js.ts_to_date(ts, true);
state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
push_state(url, ts, request_ts, capture_str, is_live);
push_state(state);
}
// Load Banner
@ -237,3 +243,4 @@ function init_hash_connect() {
}
document.addEventListener("DOMContentLoaded", init_hash_connect);

View File

@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) {
var parser = make_parser(extract_orig($wbwindow.document.baseURI));
var href = parser.href;
var hash = href.lastIndexOf("#");
if (hash >= 0) {
href = href.substring(0, hash);
}
@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) {
if (lastslash >= 0 && lastslash != (href.length - 1)) {
href = href.substring(0, lastslash + 1);
} else {
href += "/";
}
parser.href = href + url;
@ -667,15 +666,15 @@ var wombat_internal = function($wbwindow) {
// Adapted from:
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
Math.seed = parseInt(seed);
$wbwindow.Math.seed = parseInt(seed);
function seeded_random() {
Math.seed = (Math.seed * 9301 + 49297) % 233280;
var rnd = Math.seed / 233280;
$wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280;
var rnd = $wbwindow.Math.seed / 233280;
return rnd;
}
Math.random = seeded_random;
$wbwindow.Math.random = seeded_random;
}
function init_crypto_random() {
@ -687,7 +686,7 @@ var wombat_internal = function($wbwindow) {
var new_getrandom = function(array) {
for (i = 0; i < array.length; i++) {
array[i] = parseInt(Math.random() * 4294967296);
array[i] = parseInt($wbwindow.Math.random() * 4294967296);
}
return array;
}
@ -719,11 +718,23 @@ var wombat_internal = function($wbwindow) {
orig_func.call(this, state_obj, title, url);
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
$wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
wb_info.timestamp,
wb_info.request_ts,
wb_info.is_live);
//if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
// $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
// wb_info.timestamp,
// wb_info.request_ts,
// wb_info.is_live);
//}
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
var message = {
"url": url,
"ts": wb_info.timestamp,
"request_ts": wb_info.request_ts,
"is_live": wb_info.is_live,
"title": title,
"wb_type": func_name,
}
$wbwindow.__WB_top_frame.postMessage(message, "*");
}
}
@ -931,7 +942,8 @@ var wombat_internal = function($wbwindow) {
//var timezone = new Date().getTimezoneOffset() * 60 * 1000;
// Already UTC!
var timezone = 0;
var timediff = $wbwindow.Date.now() - (timestamp - timezone);
var start_now = $wbwindow.Date.now()
var timediff = start_now - (timestamp - timezone);
if ($wbwindow.__wb_Date_now) {
return;
@ -1656,13 +1668,14 @@ var wombat_internal = function($wbwindow) {
var from = source.WB_wombat_location.origin;
if (!source.__WB_id) {
source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href;
}
if (!this.__WB_win_id) {
this.__WB_win_id = {};
this.__WB_counter = 0;
}
if (!source.__WB_id) {
source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href;
}
this.__WB_win_id[source.__WB_id] = source;
src_id = source.__WB_id;
@ -1783,19 +1796,22 @@ var wombat_internal = function($wbwindow) {
//============================================
function init_open_override()
{
if (!$wbwindow.Window.prototype.open) {
return;
var orig = $wbwindow.open;
if ($wbwindow.Window.prototype.open) {
orig = $wbwindow.Window.prototype.open;
}
var orig = $wbwindow.Window.prototype.open;
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
strUrl = rewrite_url(strUrl);
strUrl = rewrite_url(strUrl, false, "");
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
}
$wbwindow.open = open_rewritten;
$wbwindow.Window.prototype.open = open_rewritten;
if ($wbwindow.Window.prototype.open) {
$wbwindow.Window.prototype.open = open_rewritten;
}
for (var i = 0; i < $wbwindow.frames.length; i++) {
try {
@ -2086,7 +2102,7 @@ var wombat_internal = function($wbwindow) {
//============================================
function get_final_url(prefix, mod, url) {
if (!mod) {
if (mod == undefined) {
mod = wb_info.mod;
}

View File

@ -1,5 +1,6 @@
from io import BytesIO
import zlib
import brotli
#=================================================================
@ -17,6 +18,11 @@ def deflate_decompressor():
def deflate_decompressor_alt():
return zlib.decompressobj(-zlib.MAX_WBITS)
def brotli_decompressor():
decomp = brotli.Decompressor()
decomp.unused_data = None
return decomp
#=================================================================
class BufferedReader(object):
@ -40,7 +46,9 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor,
'deflate': deflate_decompressor,
'deflate_alt': deflate_decompressor_alt}
'deflate_alt': deflate_decompressor_alt,
'br': brotli_decompressor
}
def __init__(self, stream, block_size=1024,
decomp_type=None,
@ -98,7 +106,7 @@ class BufferedReader(object):
if self.decompressor and data:
try:
data = self.decompressor.decompress(data)
except Exception:
except Exception as e:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
if self.decomp_type == 'deflate':
@ -108,7 +116,8 @@ class BufferedReader(object):
self.decompressor = None
# otherwise (partly decompressed), something is wrong
else:
raise
print(str(e))
return b''
return data
def read(self, length=None):
@ -180,6 +189,10 @@ class BufferedReader(object):
self.stream.close()
self.stream = None
@classmethod
def get_supported_decompressors(cls):
return cls.DECOMPRESSORS.keys()
#=================================================================
class DecompressingBufferedReader(BufferedReader):
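
With 'br' registered alongside gzip and deflate, Brotli-encoded bodies decode through the same reader. A usage sketch, assuming the brotlipy package this commit adds to setup.py and the pywb.utils.bufferedreaders module path:

    import brotli  # provided by the new 'brotlipy' dependency
    from io import BytesIO
    from pywb.utils.bufferedreaders import DecompressingBufferedReader

    data = b'The quick brown fox jumps over the lazy dog'
    reader = DecompressingBufferedReader(BytesIO(brotli.compress(data)),
                                         decomp_type='br')
    assert reader.read() == data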

View File

@ -9,10 +9,12 @@ import requests
import six
from six.moves.urllib.request import pathname2url, url2pathname
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
import time
import pkg_resources
import base64
import cgi
from io import open, BytesIO
@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
#=================================================================
def extract_post_query(method, mime, length, stream, buffered_stream=None):
def extract_post_query(method, mime, length, stream,
buffered_stream=None,
environ=None):
"""
Extract a url-encoded form POST from stream
If not a application/x-www-form-urlencoded, or no missing
content length, return None
Attempt to decode application/x-www-form-urlencoded or multipart/*,
otherwise read whole block and b64encode
"""
if method.upper() != 'POST':
return None
if ((not mime or
not mime.lower().startswith('application/x-www-form-urlencoded'))):
return None
try:
length = int(length)
except (ValueError, TypeError):
@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
buffered_stream.write(post_query)
buffered_stream.seek(0)
post_query = to_native_str(post_query)
post_query = unquote_plus(post_query)
if not mime:
mime = ''
if mime.startswith('application/x-www-form-urlencoded'):
post_query = to_native_str(post_query)
post_query = unquote_plus(post_query)
elif mime.startswith('multipart/'):
env = {'REQUEST_METHOD': 'POST',
'CONTENT_TYPE': mime,
'CONTENT_LENGTH': len(post_query)}
args = dict(fp=BytesIO(post_query),
environ=env,
keep_blank_values=True)
if six.PY3:
args['encoding'] = 'utf-8'
data = cgi.FieldStorage(**args)
values = []
for item in data.list:
values.append((item.name, item.value))
post_query = urlencode(values, True)
elif mime.startswith('application/x-amf'):
post_query = amf_parse(post_query, environ)
else:
post_query = base64.b64encode(post_query)
post_query = to_native_str(post_query)
post_query = '&__wb_post_data=' + post_query
return post_query
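
extract_post_query now dispatches on the full mime type: form-encoded bodies are decoded, multipart/* bodies are flattened through cgi.FieldStorage, application/x-amf is delegated to amf_parse, and anything else is base64-wrapped so no POST data is ever dropped. For example (values taken from the doctest changes later in this commit):

    from io import BytesIO
    from pywb.utils.loaders import extract_post_query

    body = b'foo=bar&dir=%2Fbaz'

    # form-encoded: decoded into a plain query string
    extract_post_query('POST', 'application/x-www-form-urlencoded',
                       len(body), BytesIO(body))
    # -> 'foo=bar&dir=/baz'

    # any other type: base64-wrapped in a synthetic parameter
    extract_post_query('POST', 'text/plain', len(body), BytesIO(body))
    # -> '&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'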
#=================================================================
def amf_parse(string, environ):
try:
from pyamf import remoting
res = remoting.decode(BytesIO(string))
#print(res)
body = res.bodies[0][1].body[0]
values = {}
if hasattr(body, 'body'):
values['body'] = body.body
if hasattr(body, 'source'):
values['source'] = body.source
if hasattr(body, 'operation'):
values['op'] = body.operation
if environ is not None:
environ['pywb.inputdata'] = res
query = urlencode(values)
#print(query)
return query
except Exception as e:
import traceback
traceback.print_exc()
print(e)
return None
#=================================================================
def append_post_query(url, post_query):
if not post_query:
@ -167,23 +236,34 @@ def read_last_line(fh, offset=256):
#=================================================================
class BlockLoader(object):
class BaseLoader(object):
def __init__(self, **kwargs):
pass
def load(self, url, offset=0, length=-1):
raise NotImplementedError()
#=================================================================
class BlockLoader(BaseLoader):
"""
a loader which can stream blocks of content
given a uri, offset and optional length.
Currently supports: http/https and file/local file system
"""
def __init__(self, *args, **kwargs):
loaders = {}
profile_loader = None
def __init__(self, **kwargs):
self.cached = {}
self.args = args
self.kwargs = kwargs
def load(self, url, offset=0, length=-1):
loader = self._get_loader_for(url)
loader, url = self._get_loader_for_url(url)
return loader.load(url, offset, length)
def _get_loader_for(self, url):
def _get_loader_for_url(self, url):
"""
Determine loading method based on uri
"""
@ -193,18 +273,47 @@ class BlockLoader(object):
else:
type_ = parts[0]
if '+' in type_:
profile_name, scheme = type_.split('+', 1)
if len(parts) == 2:
url = scheme + '://' + parts[1]
else:
profile_name = ''
scheme = type_
loader = self.cached.get(type_)
if loader:
return loader
return loader, url
loader_cls = self._get_loader_class_for_type(scheme)
loader_cls = LOADERS.get(type_)
if not loader_cls:
raise IOError('No Loader for type: ' + type_)
raise IOError('No Loader for type: ' + scheme)
profile = self.kwargs
if self.profile_loader:
profile = self.profile_loader(profile_name, scheme)
loader = loader_cls(**profile)
loader = loader_cls(*self.args, **self.kwargs)
self.cached[type_] = loader
return loader
return loader, url
def _get_loader_class_for_type(self, type_):
loader_cls = self.loaders.get(type_)
return loader_cls
@staticmethod
def init_default_loaders():
BlockLoader.loaders['http'] = HttpLoader
BlockLoader.loaders['https'] = HttpLoader
BlockLoader.loaders['s3'] = S3Loader
BlockLoader.loaders['file'] = LocalFileLoader
@staticmethod
def set_profile_loader(src):
BlockLoader.profile_loader = src
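
A profile+scheme:// prefix now picks a named settings profile: the prefix is split off, an optional profile_loader callback maps (profile_name, scheme) to constructor kwargs for that scheme's loader class, and the URL is reassembled without the prefix (see the 'local+http://' doctest below). A hedged sketch of wiring one up; the profile name and credentials are illustrative:

    from pywb.utils.loaders import BlockLoader

    def my_profile_loader(profile_name, scheme):
        # return constructor kwargs for this scheme's loader class,
        # e.g. per-profile S3 credentials
        if profile_name == 'archive' and scheme == 's3':
            return {'aws_access_key_id': 'AKIA...',
                    'aws_secret_access_key': 'secret...'}
        return {}

    BlockLoader.set_profile_loader(my_profile_loader)

    # 'archive+s3://bucket/key' -> S3Loader(**kwargs) loading 's3://bucket/key'
    # BlockLoader().load('archive+s3://bucket/key')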
@staticmethod
def _make_range_header(offset, length):
@ -217,10 +326,7 @@ class BlockLoader(object):
#=================================================================
class LocalFileLoader(object):
def __init__(self, *args, **kwargs):
pass
class LocalFileLoader(BaseLoader):
def load(self, url, offset=0, length=-1):
"""
Load a file-like reader from the local file system
@ -260,9 +366,11 @@ class LocalFileLoader(object):
#=================================================================
class HttpLoader(object):
def __init__(self, cookie_maker=None, *args, **kwargs):
self.cookie_maker = cookie_maker
class HttpLoader(BaseLoader):
def __init__(self, **kwargs):
self.cookie_maker = kwargs.get('cookie_maker')
if not self.cookie_maker:
self.cookie_maker = kwargs.get('cookie')
self.session = None
def load(self, url, offset, length):
@ -288,33 +396,47 @@ class HttpLoader(object):
#=================================================================
class S3Loader(object):
def __init__(self, *args, **kwargs):
class S3Loader(BaseLoader):
def __init__(self, **kwargs):
self.s3conn = None
self.aws_access_key_id = kwargs.get('aws_access_key_id')
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
def load(self, url, offset, length):
if not s3_avail: #pragma: no cover
raise IOError('To load from s3 paths, ' +
'you must install boto: pip install boto')
if not self.s3conn:
try:
self.s3conn = connect_s3()
except Exception: #pragma: no cover
self.s3conn = connect_s3(anon=True)
aws_access_key_id = self.aws_access_key_id
aws_secret_access_key = self.aws_secret_access_key
parts = urlsplit(url)
bucket = self.s3conn.get_bucket(parts.netloc)
if parts.username and parts.password:
aws_access_key_id = unquote_plus(parts.username)
aws_secret_access_key = unquote_plus(parts.password)
bucket_name = parts.netloc.split('@', 1)[-1]
else:
bucket_name = parts.netloc
headers = {'Range': BlockLoader._make_range_header(offset, length)}
if not self.s3conn:
try:
self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key)
except Exception: #pragma: no cover
self.s3conn = connect_s3(anon=True)
bucket = self.s3conn.get_bucket(bucket_name)
key = bucket.get_key(parts.path)
result = key.get_contents_as_string(headers=headers)
key.close()
if offset == 0 and length == -1:
headers = {}
else:
headers = {'Range': BlockLoader._make_range_header(offset, length)}
return BytesIO(result)
# Read range
key.open_read(headers=headers)
return key
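
S3Loader now accepts credentials either as constructor kwargs or percent-encoded directly in the URL's userinfo, and only sends a Range header when a real range is requested. Accepted URL forms, as a sketch (bucket, key, and credential values are illustrative):

    from pywb.utils.loaders import S3Loader

    # anonymous / ambient credentials
    S3Loader().load('s3://my-bucket/path/file.warc.gz', 0, -1)

    # credentials in the URL; '%2F' etc. are unquoted with unquote_plus
    url = 's3://MYACCESSKEY:MY%2FSECRET@my-bucket/path/file.warc.gz'
    S3Loader().load(url, 0, -1)     # whole object, no Range header
    S3Loader().load(url, 100, 50)   # ranged read of 50 bytes at offset 100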
#=================================================================
@ -414,12 +536,6 @@ class LimitReader(object):
return stream
#=================================================================
LOADERS = {'http': HttpLoader,
'https': HttpLoader,
's3': S3Loader,
'file': LocalFileLoader
}
# ============================================================================
BlockLoader.init_default_loaders()

View File

@ -133,6 +133,14 @@ def compress_alt(buff):
return compressed
# Brotli
def test_brotli():
with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
x = DecompressingBufferedReader(fh, decomp_type='br')
assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
# Errors
@ -140,9 +148,11 @@ def test_err_compress_mix():
# error: compressed member, followed by not compressed -- considered invalid
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
b = x.read()
b = x.read_next_member()
with pytest.raises(zlib.error):
x.read()
assert b == b'ABC'
x.read_next_member()
assert x.read() == b''
#with pytest.raises(zlib.error):
# x.read()
#error: Error -3 while decompressing: incorrect header check
def test_err_chunk_cut_off():

View File

@ -37,17 +37,21 @@ Traceback (most recent call last):
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
# HMAC Cookie Maker
>>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
'Example Domain'
# fixed cookie, range request
>>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read())
>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read())
'Example Domain'
# range request
>>> print_str(BlockLoader().load('http://example.com', 1262).read())
'</html>\n'
# custom profile
>>> print_str(BlockLoader().load('local+http://example.com', 1262).read())
'</html>\n'
# unknown loader error
#>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
@ -90,8 +94,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
# unsupported method
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
# unsupported type
# base64 encode
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
# invalid length
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))

View File

@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
self['mime'] = def_mime
if mime:
self['mime'] = self.MIME_RE.split(mime, 1)[0]
self['_content_type'] = mime
def extract_status(self, status_headers):
""" Extract status code only from status line
@ -390,7 +391,7 @@ class DefaultRecordParser(object):
len_ = record.status_headers.get_header('Content-Length')
post_query = extract_post_query(method,
entry.get('mime'),
entry.get('_content_type'),
len_,
record.stream)
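
'mime' is still trimmed to the bare type for indexing, but the untrimmed header is kept in '_content_type' because multipart parsing needs the boundary parameter. Roughly (values illustrative):

    mime = 'multipart/form-data; boundary=----Boundary123'
    entry['mime'] = 'multipart/form-data'  # trimmed, used for CDX fields
    entry['_content_type'] = mime          # full header, fed to extract_post_query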

View File

@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object):
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True):
if not loader:
loader = BlockLoader(cookie_maker)
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size

View File

@ -174,7 +174,8 @@ class ReplayView(object):
stream=stream,
head_insert_func=head_insert_func,
urlkey=cdx['urlkey'],
cdx=cdx))
cdx=cdx,
env=wbrequest.env))
(status_headers, response_iter, is_rewritten) = result

View File

@ -0,0 +1,2 @@
(binary file — apparently the Brotli-compressed 'quickfox_repeated.compressed' fixture used by test_brotli; contents not rendered)

View File

@ -78,7 +78,8 @@ setup(
'requests',
'redis',
'jinja2',
'surt==0.3b4',
'surt>=0.3.0',
'brotlipy',
'pyyaml',
'watchdog',
'webencodings',
@ -90,9 +91,6 @@ setup(
'fakeredis',
'mock',
],
dependency_links=[
'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
],
cmdclass={'test': PyTest},
test_suite='',
entry_points="""