mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'develop'
This commit is contained in:
commit
f4e5a7df5d
@ -1,4 +1,4 @@
|
|||||||
PyWb 0.30.1
|
PyWb 0.31.0
|
||||||
===========
|
===========
|
||||||
|
|
||||||
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
|
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
__version__ = '0.30.1'
|
__version__ = '0.31.0'
|
||||||
|
|
||||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ class CDXObject(OrderedDict):
|
|||||||
raise CDXException(msg)
|
raise CDXException(msg)
|
||||||
|
|
||||||
for header, field in zip(cdxformat, fields):
|
for header, field in zip(cdxformat, fields):
|
||||||
self[header] = field.decode('utf-8')
|
self[header] = to_native_str(field, 'utf-8')
|
||||||
|
|
||||||
self.cdxline = cdxline
|
self.cdxline = cdxline
|
||||||
|
|
||||||
@ -213,7 +213,7 @@ class CDXObject(OrderedDict):
|
|||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if self.cdxline:
|
if self.cdxline:
|
||||||
return self.cdxline.decode('utf-8')
|
return to_native_str(self.cdxline, 'utf-8')
|
||||||
|
|
||||||
if not self._from_json:
|
if not self._from_json:
|
||||||
return ' '.join(str(val) for val in six.itervalues(self))
|
return ' '.join(str(val) for val in six.itervalues(self))
|
||||||
@ -263,7 +263,7 @@ class IDXObject(OrderedDict):
|
|||||||
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
||||||
|
|
||||||
for header, field in zip(self.FORMAT, fields):
|
for header, field in zip(self.FORMAT, fields):
|
||||||
self[header] = field.decode('utf-8')
|
self[header] = to_native_str(field, 'utf-8')
|
||||||
|
|
||||||
self['offset'] = int(self['offset'])
|
self['offset'] = int(self['offset'])
|
||||||
self['length'] = int(self['length'])
|
self['length'] = int(self['length'])
|
||||||
@ -285,4 +285,4 @@ class IDXObject(OrderedDict):
|
|||||||
return json_encode(self) + '\n'
|
return json_encode(self) + '\n'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.idxline.decode('utf-8')
|
return to_native_str(self.idxline, 'utf-8')
|
||||||
|
@ -184,14 +184,15 @@ class WbRequest(object):
|
|||||||
if not self.wb_url:
|
if not self.wb_url:
|
||||||
return
|
return
|
||||||
|
|
||||||
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
|
mime = self.env.get('CONTENT_TYPE', '')
|
||||||
length = self.env.get('CONTENT_LENGTH')
|
length = self.env.get('CONTENT_LENGTH')
|
||||||
stream = self.env['wsgi.input']
|
stream = self.env['wsgi.input']
|
||||||
|
|
||||||
buffered_stream = BytesIO()
|
buffered_stream = BytesIO()
|
||||||
|
|
||||||
post_query = extract_post_query('POST', mime, length, stream,
|
post_query = extract_post_query('POST', mime, length, stream,
|
||||||
buffered_stream=buffered_stream)
|
buffered_stream=buffered_stream,
|
||||||
|
environ=self.env)
|
||||||
|
|
||||||
if post_query:
|
if post_query:
|
||||||
self.env['wsgi.input'] = buffered_stream
|
self.env['wsgi.input'] = buffered_stream
|
||||||
|
@ -120,7 +120,7 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
def _rewrite_meta_refresh(self, meta_refresh):
|
def _rewrite_meta_refresh(self, meta_refresh):
|
||||||
if not meta_refresh:
|
if not meta_refresh:
|
||||||
return None
|
return ''
|
||||||
|
|
||||||
m = self.META_REFRESH_REGEX.match(meta_refresh)
|
m = self.META_REFRESH_REGEX.match(meta_refresh)
|
||||||
if not m:
|
if not m:
|
||||||
@ -133,6 +133,9 @@ class HTMLRewriterMixin(object):
|
|||||||
return meta_refresh
|
return meta_refresh
|
||||||
|
|
||||||
def _rewrite_base(self, url, mod=''):
|
def _rewrite_base(self, url, mod=''):
|
||||||
|
if not url:
|
||||||
|
return ''
|
||||||
|
|
||||||
url = self._ensure_url_has_path(url)
|
url = self._ensure_url_has_path(url)
|
||||||
|
|
||||||
base_url = self._rewrite_url(url, mod)
|
base_url = self._rewrite_url(url, mod)
|
||||||
@ -183,11 +186,11 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
def _rewrite_url(self, value, mod=None):
|
def _rewrite_url(self, value, mod=None):
|
||||||
if not value:
|
if not value:
|
||||||
return None
|
return ''
|
||||||
|
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
if not value:
|
if not value:
|
||||||
return None
|
return ''
|
||||||
|
|
||||||
value = self.try_unescape(value)
|
value = self.try_unescape(value)
|
||||||
return self.url_rewriter.rewrite(value, mod)
|
return self.url_rewriter.rewrite(value, mod)
|
||||||
@ -209,21 +212,24 @@ class HTMLRewriterMixin(object):
|
|||||||
return new_value
|
return new_value
|
||||||
|
|
||||||
def _rewrite_srcset(self, value, mod=''):
|
def _rewrite_srcset(self, value, mod=''):
|
||||||
|
if not value:
|
||||||
|
return ''
|
||||||
|
|
||||||
values = value.split(',')
|
values = value.split(',')
|
||||||
values = map(lambda x: self._rewrite_url(x.strip()), values)
|
values = [self._rewrite_url(v.strip()) for v in values]
|
||||||
return ', '.join(values)
|
return ', '.join(values)
|
||||||
|
|
||||||
def _rewrite_css(self, css_content):
|
def _rewrite_css(self, css_content):
|
||||||
if css_content:
|
if css_content:
|
||||||
return self.css_rewriter.rewrite(css_content)
|
return self.css_rewriter.rewrite(css_content)
|
||||||
else:
|
else:
|
||||||
return None
|
return ''
|
||||||
|
|
||||||
def _rewrite_script(self, script_content):
|
def _rewrite_script(self, script_content):
|
||||||
if script_content:
|
if script_content:
|
||||||
return self.js_rewriter.rewrite(script_content)
|
return self.js_rewriter.rewrite(script_content)
|
||||||
else:
|
else:
|
||||||
return None
|
return ''
|
||||||
|
|
||||||
def has_attr(self, tag_attrs, attr):
|
def has_attr(self, tag_attrs, attr):
|
||||||
name, value = attr
|
name, value = attr
|
||||||
@ -252,6 +258,11 @@ class HTMLRewriterMixin(object):
|
|||||||
self.out.write('<' + tag)
|
self.out.write('<' + tag)
|
||||||
|
|
||||||
for attr_name, attr_value in tag_attrs:
|
for attr_name, attr_value in tag_attrs:
|
||||||
|
empty_attr = False
|
||||||
|
if attr_value is None:
|
||||||
|
attr_value = ''
|
||||||
|
empty_attr = True
|
||||||
|
|
||||||
# special case: inline JS/event handler
|
# special case: inline JS/event handler
|
||||||
if ((attr_value and attr_value.startswith('javascript:'))
|
if ((attr_value and attr_value.startswith('javascript:'))
|
||||||
or attr_name.startswith('on')):
|
or attr_name.startswith('on')):
|
||||||
@ -324,7 +335,7 @@ class HTMLRewriterMixin(object):
|
|||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
# write the attr!
|
# write the attr!
|
||||||
self._write_attr(attr_name, attr_value)
|
self._write_attr(attr_name, attr_value, empty_attr)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -347,11 +358,17 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _write_attr(self, name, value):
|
def _write_attr(self, name, value, empty_attr):
|
||||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
# if empty_attr is set, just write 'attr'!
|
||||||
# 'attr=""' is more common, so use that form
|
if empty_attr:
|
||||||
if value:
|
self.out.write(' ' + name)
|
||||||
|
|
||||||
|
# write with value, if set
|
||||||
|
elif value:
|
||||||
|
|
||||||
self.out.write(' ' + name + '="' + value.replace('"', '"') + '"')
|
self.out.write(' ' + name + '="' + value.replace('"', '"') + '"')
|
||||||
|
|
||||||
|
# otherwise, 'attr=""' is more common, so use that form
|
||||||
else:
|
else:
|
||||||
self.out.write(' ' + name + '=""')
|
self.out.write(' ' + name + '=""')
|
||||||
|
|
||||||
@ -421,8 +438,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
|||||||
def feed(self, string):
|
def feed(self, string):
|
||||||
try:
|
try:
|
||||||
HTMLParser.feed(self, string)
|
HTMLParser.feed(self, string)
|
||||||
except Exception: # pragma: no cover
|
except Exception as e: # pragma: no cover
|
||||||
# only raised in 2.6
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
self.out.write(string)
|
self.out.write(string)
|
||||||
|
|
||||||
def _internal_close(self):
|
def _internal_close(self):
|
||||||
|
52
pywb/rewrite/rewrite_amf.py
Normal file
52
pywb/rewrite/rewrite_amf.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from six.moves import zip
|
||||||
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Expiermental: not fully tested
|
||||||
|
class RewriteContentAMF(RewriteContent): #pragma: no cover
|
||||||
|
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
|
||||||
|
|
||||||
|
if status_headers.get_header('Content-Type') == 'application/x-amf':
|
||||||
|
stream = self.rewrite_amf(stream, env)
|
||||||
|
|
||||||
|
return (super(RewriteContentAMF, self).
|
||||||
|
handle_custom_rewrite(text_type, status_headers, stream, env))
|
||||||
|
|
||||||
|
def rewrite_amf(self, stream, env):
|
||||||
|
try:
|
||||||
|
from pyamf import remoting
|
||||||
|
|
||||||
|
iobuff = BytesIO()
|
||||||
|
while True:
|
||||||
|
buff = stream.read()
|
||||||
|
if not buff:
|
||||||
|
break
|
||||||
|
iobuff.write(buff)
|
||||||
|
|
||||||
|
iobuff.seek(0)
|
||||||
|
res = remoting.decode(iobuff)
|
||||||
|
|
||||||
|
if env and env.get('pywb.inputdata'):
|
||||||
|
inputdata = env.get('pywb.inputdata')
|
||||||
|
|
||||||
|
new_list = []
|
||||||
|
|
||||||
|
for src, target in zip(inputdata.bodies, res.bodies):
|
||||||
|
#print(target[0] + ' = ' + src[0])
|
||||||
|
|
||||||
|
#print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId)
|
||||||
|
target[1].body.correlationId = src[1].body[0].messageId
|
||||||
|
|
||||||
|
new_list.append((src[0], target[1]))
|
||||||
|
|
||||||
|
res.bodies = new_list
|
||||||
|
|
||||||
|
return BytesIO(remoting.encode(res).getvalue())
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
print(e)
|
||||||
|
return stream
|
@ -4,7 +4,7 @@ import webencodings
|
|||||||
import yaml
|
import yaml
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from chardet.universaldetector import UniversalDetector
|
#from chardet.universaldetector import UniversalDetector
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
||||||
@ -21,7 +21,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriteContent:
|
class RewriteContent(object):
|
||||||
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
|
||||||
|
|
||||||
TAG_REGEX = re.compile(b'^\s*\<')
|
TAG_REGEX = re.compile(b'^\s*\<')
|
||||||
@ -77,6 +77,7 @@ class RewriteContent:
|
|||||||
|
|
||||||
|
|
||||||
def _check_encoding(self, rewritten_headers, stream, enc):
|
def _check_encoding(self, rewritten_headers, stream, enc):
|
||||||
|
matched = False
|
||||||
if (rewritten_headers.
|
if (rewritten_headers.
|
||||||
contains_removed_header('content-encoding', enc)):
|
contains_removed_header('content-encoding', enc)):
|
||||||
|
|
||||||
@ -87,14 +88,15 @@ class RewriteContent:
|
|||||||
stream = DecompressingBufferedReader(stream, decomp_type=enc)
|
stream = DecompressingBufferedReader(stream, decomp_type=enc)
|
||||||
|
|
||||||
rewritten_headers.status_headers.remove_header('content-length')
|
rewritten_headers.status_headers.remove_header('content-length')
|
||||||
|
matched = True
|
||||||
|
|
||||||
return stream
|
return matched, stream
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def rewrite_content(self, urlrewriter, status_headers, stream,
|
def rewrite_content(self, urlrewriter, status_headers, stream,
|
||||||
head_insert_func=None, urlkey='',
|
head_insert_func=None, urlkey='',
|
||||||
cdx=None, cookie_rewriter=None):
|
cdx=None, cookie_rewriter=None, env=None):
|
||||||
|
|
||||||
wb_url = urlrewriter.wburl
|
wb_url = urlrewriter.wburl
|
||||||
|
|
||||||
@ -118,9 +120,12 @@ class RewriteContent:
|
|||||||
|
|
||||||
status_headers = rewritten_headers.status_headers
|
status_headers = rewritten_headers.status_headers
|
||||||
|
|
||||||
# use rewritten headers, but no further rewriting needed
|
res = self.handle_custom_rewrite(rewritten_headers.text_type,
|
||||||
if rewritten_headers.text_type is None:
|
status_headers,
|
||||||
return (status_headers, self.stream_to_gen(stream), False)
|
stream,
|
||||||
|
env)
|
||||||
|
if res:
|
||||||
|
return res
|
||||||
|
|
||||||
# Handle text content rewriting
|
# Handle text content rewriting
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
@ -136,8 +141,12 @@ class RewriteContent:
|
|||||||
encoding = None
|
encoding = None
|
||||||
first_buff = b''
|
first_buff = b''
|
||||||
|
|
||||||
stream = self._check_encoding(rewritten_headers, stream, 'gzip')
|
for decomp_type in BufferedReader.get_supported_decompressors():
|
||||||
stream = self._check_encoding(rewritten_headers, stream, 'deflate')
|
matched, stream = self._check_encoding(rewritten_headers,
|
||||||
|
stream,
|
||||||
|
decomp_type)
|
||||||
|
if matched:
|
||||||
|
break
|
||||||
|
|
||||||
if mod == 'js_':
|
if mod == 'js_':
|
||||||
text_type, stream = self._resolve_text_type('js',
|
text_type, stream = self._resolve_text_type('js',
|
||||||
@ -237,6 +246,11 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (status_headers, gen, True)
|
return (status_headers, gen, True)
|
||||||
|
|
||||||
|
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
|
||||||
|
# use rewritten headers, but no further rewriting needed
|
||||||
|
if text_type is None:
|
||||||
|
return (status_headers, self.stream_to_gen(stream), False)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_html_charset(buff, status_headers):
|
def _extract_html_charset(buff, status_headers):
|
||||||
charset = None
|
charset = None
|
||||||
@ -360,3 +374,5 @@ class RewriteContent:
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
stream.close()
|
stream.close()
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,6 +49,12 @@ r"""
|
|||||||
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
||||||
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||||
|
|
||||||
|
# Empty url
|
||||||
|
>>> parse('<base href="">')
|
||||||
|
<base href="">
|
||||||
|
|
||||||
|
>>> parse('<base href>')
|
||||||
|
<base href>
|
||||||
|
|
||||||
|
|
||||||
# HTML Entities
|
# HTML Entities
|
||||||
@ -66,6 +72,10 @@ r"""
|
|||||||
>>> parse('<input value="&X&"">X</input>')
|
>>> parse('<input value="&X&"">X</input>')
|
||||||
<input value="&X&"">X</input>
|
<input value="&X&"">X</input>
|
||||||
|
|
||||||
|
# Empty values should be ignored
|
||||||
|
>>> parse('<input name="foo" value>')
|
||||||
|
<input name="foo" value>
|
||||||
|
|
||||||
# SKIPPED
|
# SKIPPED
|
||||||
# Unicode -- default with %-encoding
|
# Unicode -- default with %-encoding
|
||||||
#>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
#>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
||||||
@ -92,7 +102,7 @@ r"""
|
|||||||
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
|
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
|
||||||
|
|
||||||
>>> parse('<META http-equiv="refresh" content>')
|
>>> parse('<META http-equiv="refresh" content>')
|
||||||
<meta http-equiv="refresh" content="">
|
<meta http-equiv="refresh" content>
|
||||||
|
|
||||||
>>> parse('<meta property="og:image" content="http://example.com/example.jpg">')
|
>>> parse('<meta property="og:image" content="http://example.com/example.jpg">')
|
||||||
<meta property="og:image" content="/web/20131226101010/http://example.com/example.jpg">
|
<meta property="og:image" content="/web/20131226101010/http://example.com/example.jpg">
|
||||||
@ -115,6 +125,10 @@ r"""
|
|||||||
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
|
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
|
||||||
<img srcset="/web/20131226101010///example.com/1x 1x, /web/20131226101010///example.com/foo 2x, /web/20131226101010/https://example.com/bar 4x">
|
<img srcset="/web/20131226101010///example.com/1x 1x, /web/20131226101010///example.com/foo 2x, /web/20131226101010/https://example.com/bar 4x">
|
||||||
|
|
||||||
|
# empty srcset attrib
|
||||||
|
>>> parse('<img srcset="">')
|
||||||
|
<img srcset="">
|
||||||
|
|
||||||
# Script tag
|
# Script tag
|
||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||||
@ -131,7 +145,7 @@ r"""
|
|||||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||||
|
|
||||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||||
|
|
||||||
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
|
||||||
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>
|
||||||
|
@ -123,12 +123,23 @@ function notify_top() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (window.__WB_top_frame.update_wb_url) {
|
//if (window.__WB_top_frame.update_wb_url) {
|
||||||
window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
|
// window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href,
|
||||||
wbinfo.timestamp,
|
// wbinfo.timestamp,
|
||||||
wbinfo.request_ts,
|
// wbinfo.request_ts,
|
||||||
wbinfo.is_live);
|
// wbinfo.is_live);
|
||||||
}
|
//}
|
||||||
|
|
||||||
|
var message = {
|
||||||
|
"url": window.WB_wombat_location.href,
|
||||||
|
"ts": wbinfo.timestamp,
|
||||||
|
"request_ts": wbinfo.request_ts,
|
||||||
|
"is_live": wbinfo.is_live,
|
||||||
|
"title": "",
|
||||||
|
"wb_type": "load",
|
||||||
|
}
|
||||||
|
|
||||||
|
window.__WB_top_frame.postMessage(message, "*");
|
||||||
|
|
||||||
remove_event("readystatechange", notify_top, document);
|
remove_event("readystatechange", notify_top, document);
|
||||||
}
|
}
|
||||||
|
@ -38,27 +38,21 @@ function make_url(url, ts, mod)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function push_state(url, timestamp, request_ts, capture_str, is_live) {
|
function push_state(state) {
|
||||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||||
if (frame.WB_wombat_location) {
|
if (frame.WB_wombat_location) {
|
||||||
var curr_href = frame.WB_wombat_location.href;
|
var curr_href = frame.WB_wombat_location.href;
|
||||||
|
|
||||||
// If not current url, don't update
|
// If not current url, don't update
|
||||||
if (url != curr_href) {
|
if (state.url != curr_href) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var state = {}
|
state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod);
|
||||||
state.timestamp = timestamp;
|
state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod);
|
||||||
state.request_ts = request_ts;
|
|
||||||
state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod);
|
|
||||||
state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod);
|
|
||||||
state.url = url;
|
|
||||||
state.capture_str = capture_str;
|
|
||||||
state.is_live = is_live;
|
|
||||||
|
|
||||||
var canon_url = make_url(url, state.request_ts, "");
|
var canon_url = make_url(state.url, state.request_ts, "");
|
||||||
if (window.location.href != canon_url) {
|
if (window.location.href != canon_url) {
|
||||||
window.history.replaceState(state, "", canon_url);
|
window.history.replaceState(state, "", canon_url);
|
||||||
}
|
}
|
||||||
@ -157,7 +151,13 @@ function iframe_loaded(event) {
|
|||||||
request_ts = ts;
|
request_ts = ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
update_wb_url(url, ts, request_ts, is_live);
|
var state = {}
|
||||||
|
state["url"] = url;
|
||||||
|
state["ts"] = ts;
|
||||||
|
state["request_ts"] = request_ts;
|
||||||
|
state["is_live"] = is_live
|
||||||
|
|
||||||
|
update_wb_url(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -165,12 +165,18 @@ function init_pm() {
|
|||||||
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
var frame = document.getElementById(IFRAME_ID).contentWindow;
|
||||||
|
|
||||||
window.addEventListener("message", function(event) {
|
window.addEventListener("message", function(event) {
|
||||||
// Pass to replay frame
|
|
||||||
if (event.source == window.parent) {
|
if (event.source == window.parent) {
|
||||||
|
// Pass to replay frame
|
||||||
frame.postMessage(event.data, "*");
|
frame.postMessage(event.data, "*");
|
||||||
} else if (event.source == frame) {
|
} else if (event.source == frame) {
|
||||||
// Pass to parent
|
|
||||||
window.parent.postMessage(event.data, "*");
|
// Check if iframe url change message
|
||||||
|
if (typeof(event.data) == "object" && event.data["wb_type"]) {
|
||||||
|
update_wb_url(event.data);
|
||||||
|
} else {
|
||||||
|
// Pass to parent
|
||||||
|
window.parent.postMessage(event.data, "*");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -181,14 +187,14 @@ function init_pm() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function update_wb_url(url, ts, request_ts, is_live) {
|
function update_wb_url(state) {
|
||||||
if (curr_state.url == url && curr_state.timestamp == ts) {
|
if (curr_state.url == state.url && curr_state.ts == state.ts) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
capture_str = _wb_js.ts_to_date(ts, true);
|
state['capture_str'] = _wb_js.ts_to_date(state.ts, true);
|
||||||
|
|
||||||
push_state(url, ts, request_ts, capture_str, is_live);
|
push_state(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load Banner
|
// Load Banner
|
||||||
@ -237,3 +243,4 @@ function init_hash_connect() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
document.addEventListener("DOMContentLoaded", init_hash_connect);
|
document.addEventListener("DOMContentLoaded", init_hash_connect);
|
||||||
|
|
||||||
|
@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) {
|
|||||||
var parser = make_parser(extract_orig($wbwindow.document.baseURI));
|
var parser = make_parser(extract_orig($wbwindow.document.baseURI));
|
||||||
var href = parser.href;
|
var href = parser.href;
|
||||||
var hash = href.lastIndexOf("#");
|
var hash = href.lastIndexOf("#");
|
||||||
|
|
||||||
if (hash >= 0) {
|
if (hash >= 0) {
|
||||||
href = href.substring(0, hash);
|
href = href.substring(0, hash);
|
||||||
}
|
}
|
||||||
@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) {
|
|||||||
|
|
||||||
if (lastslash >= 0 && lastslash != (href.length - 1)) {
|
if (lastslash >= 0 && lastslash != (href.length - 1)) {
|
||||||
href = href.substring(0, lastslash + 1);
|
href = href.substring(0, lastslash + 1);
|
||||||
} else {
|
|
||||||
href += "/";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
parser.href = href + url;
|
parser.href = href + url;
|
||||||
@ -667,15 +666,15 @@ var wombat_internal = function($wbwindow) {
|
|||||||
// Adapted from:
|
// Adapted from:
|
||||||
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
|
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
|
||||||
|
|
||||||
Math.seed = parseInt(seed);
|
$wbwindow.Math.seed = parseInt(seed);
|
||||||
function seeded_random() {
|
function seeded_random() {
|
||||||
Math.seed = (Math.seed * 9301 + 49297) % 233280;
|
$wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280;
|
||||||
var rnd = Math.seed / 233280;
|
var rnd = $wbwindow.Math.seed / 233280;
|
||||||
|
|
||||||
return rnd;
|
return rnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
Math.random = seeded_random;
|
$wbwindow.Math.random = seeded_random;
|
||||||
}
|
}
|
||||||
|
|
||||||
function init_crypto_random() {
|
function init_crypto_random() {
|
||||||
@ -687,7 +686,7 @@ var wombat_internal = function($wbwindow) {
|
|||||||
|
|
||||||
var new_getrandom = function(array) {
|
var new_getrandom = function(array) {
|
||||||
for (i = 0; i < array.length; i++) {
|
for (i = 0; i < array.length; i++) {
|
||||||
array[i] = parseInt(Math.random() * 4294967296);
|
array[i] = parseInt($wbwindow.Math.random() * 4294967296);
|
||||||
}
|
}
|
||||||
return array;
|
return array;
|
||||||
}
|
}
|
||||||
@ -719,11 +718,23 @@ var wombat_internal = function($wbwindow) {
|
|||||||
|
|
||||||
orig_func.call(this, state_obj, title, url);
|
orig_func.call(this, state_obj, title, url);
|
||||||
|
|
||||||
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
|
//if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) {
|
||||||
$wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
|
// $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href,
|
||||||
wb_info.timestamp,
|
// wb_info.timestamp,
|
||||||
wb_info.request_ts,
|
// wb_info.request_ts,
|
||||||
wb_info.is_live);
|
// wb_info.is_live);
|
||||||
|
//}
|
||||||
|
if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) {
|
||||||
|
var message = {
|
||||||
|
"url": url,
|
||||||
|
"ts": wb_info.timestamp,
|
||||||
|
"request_ts": wb_info.request_ts,
|
||||||
|
"is_live": wb_info.is_live,
|
||||||
|
"title": title,
|
||||||
|
"wb_type": func_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
$wbwindow.__WB_top_frame.postMessage(message, "*");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -931,7 +942,8 @@ var wombat_internal = function($wbwindow) {
|
|||||||
//var timezone = new Date().getTimezoneOffset() * 60 * 1000;
|
//var timezone = new Date().getTimezoneOffset() * 60 * 1000;
|
||||||
// Already UTC!
|
// Already UTC!
|
||||||
var timezone = 0;
|
var timezone = 0;
|
||||||
var timediff = $wbwindow.Date.now() - (timestamp - timezone);
|
var start_now = $wbwindow.Date.now()
|
||||||
|
var timediff = start_now - (timestamp - timezone);
|
||||||
|
|
||||||
if ($wbwindow.__wb_Date_now) {
|
if ($wbwindow.__wb_Date_now) {
|
||||||
return;
|
return;
|
||||||
@ -1656,13 +1668,14 @@ var wombat_internal = function($wbwindow) {
|
|||||||
|
|
||||||
var from = source.WB_wombat_location.origin;
|
var from = source.WB_wombat_location.origin;
|
||||||
|
|
||||||
if (!source.__WB_id) {
|
|
||||||
source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href;
|
|
||||||
}
|
|
||||||
if (!this.__WB_win_id) {
|
if (!this.__WB_win_id) {
|
||||||
this.__WB_win_id = {};
|
this.__WB_win_id = {};
|
||||||
|
this.__WB_counter = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!source.__WB_id) {
|
||||||
|
source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href;
|
||||||
|
}
|
||||||
this.__WB_win_id[source.__WB_id] = source;
|
this.__WB_win_id[source.__WB_id] = source;
|
||||||
|
|
||||||
src_id = source.__WB_id;
|
src_id = source.__WB_id;
|
||||||
@ -1783,19 +1796,22 @@ var wombat_internal = function($wbwindow) {
|
|||||||
//============================================
|
//============================================
|
||||||
function init_open_override()
|
function init_open_override()
|
||||||
{
|
{
|
||||||
if (!$wbwindow.Window.prototype.open) {
|
var orig = $wbwindow.open;
|
||||||
return;
|
|
||||||
|
if ($wbwindow.Window.prototype.open) {
|
||||||
|
orig = $wbwindow.Window.prototype.open;
|
||||||
}
|
}
|
||||||
|
|
||||||
var orig = $wbwindow.Window.prototype.open;
|
|
||||||
|
|
||||||
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
|
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
|
||||||
strUrl = rewrite_url(strUrl);
|
strUrl = rewrite_url(strUrl, false, "");
|
||||||
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
|
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
|
||||||
}
|
}
|
||||||
|
|
||||||
$wbwindow.open = open_rewritten;
|
$wbwindow.open = open_rewritten;
|
||||||
$wbwindow.Window.prototype.open = open_rewritten;
|
|
||||||
|
if ($wbwindow.Window.prototype.open) {
|
||||||
|
$wbwindow.Window.prototype.open = open_rewritten;
|
||||||
|
}
|
||||||
|
|
||||||
for (var i = 0; i < $wbwindow.frames.length; i++) {
|
for (var i = 0; i < $wbwindow.frames.length; i++) {
|
||||||
try {
|
try {
|
||||||
@ -2086,7 +2102,7 @@ var wombat_internal = function($wbwindow) {
|
|||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function get_final_url(prefix, mod, url) {
|
function get_final_url(prefix, mod, url) {
|
||||||
if (!mod) {
|
if (mod == undefined) {
|
||||||
mod = wb_info.mod;
|
mod = wb_info.mod;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import zlib
|
import zlib
|
||||||
|
import brotli
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -17,6 +18,11 @@ def deflate_decompressor():
|
|||||||
def deflate_decompressor_alt():
|
def deflate_decompressor_alt():
|
||||||
return zlib.decompressobj(-zlib.MAX_WBITS)
|
return zlib.decompressobj(-zlib.MAX_WBITS)
|
||||||
|
|
||||||
|
def brotli_decompressor():
|
||||||
|
decomp = brotli.Decompressor()
|
||||||
|
decomp.unused_data = None
|
||||||
|
return decomp
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class BufferedReader(object):
|
class BufferedReader(object):
|
||||||
@ -40,7 +46,9 @@ class BufferedReader(object):
|
|||||||
|
|
||||||
DECOMPRESSORS = {'gzip': gzip_decompressor,
|
DECOMPRESSORS = {'gzip': gzip_decompressor,
|
||||||
'deflate': deflate_decompressor,
|
'deflate': deflate_decompressor,
|
||||||
'deflate_alt': deflate_decompressor_alt}
|
'deflate_alt': deflate_decompressor_alt,
|
||||||
|
'br': brotli_decompressor
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, stream, block_size=1024,
|
def __init__(self, stream, block_size=1024,
|
||||||
decomp_type=None,
|
decomp_type=None,
|
||||||
@ -98,7 +106,7 @@ class BufferedReader(object):
|
|||||||
if self.decompressor and data:
|
if self.decompressor and data:
|
||||||
try:
|
try:
|
||||||
data = self.decompressor.decompress(data)
|
data = self.decompressor.decompress(data)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
# if first read attempt, assume non-gzipped stream
|
# if first read attempt, assume non-gzipped stream
|
||||||
if self.num_read == 0:
|
if self.num_read == 0:
|
||||||
if self.decomp_type == 'deflate':
|
if self.decomp_type == 'deflate':
|
||||||
@ -108,7 +116,8 @@ class BufferedReader(object):
|
|||||||
self.decompressor = None
|
self.decompressor = None
|
||||||
# otherwise (partly decompressed), something is wrong
|
# otherwise (partly decompressed), something is wrong
|
||||||
else:
|
else:
|
||||||
raise
|
print(str(e))
|
||||||
|
return b''
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def read(self, length=None):
|
def read(self, length=None):
|
||||||
@ -180,6 +189,10 @@ class BufferedReader(object):
|
|||||||
self.stream.close()
|
self.stream.close()
|
||||||
self.stream = None
|
self.stream = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_decompressors(cls):
|
||||||
|
return cls.DECOMPRESSORS.keys()
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class DecompressingBufferedReader(BufferedReader):
|
class DecompressingBufferedReader(BufferedReader):
|
||||||
|
@ -9,10 +9,12 @@ import requests
|
|||||||
|
|
||||||
import six
|
import six
|
||||||
from six.moves.urllib.request import pathname2url, url2pathname
|
from six.moves.urllib.request import pathname2url, url2pathname
|
||||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit
|
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
import base64
|
||||||
|
import cgi
|
||||||
|
|
||||||
from io import open, BytesIO
|
from io import open, BytesIO
|
||||||
|
|
||||||
@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
def extract_post_query(method, mime, length, stream,
|
||||||
|
buffered_stream=None,
|
||||||
|
environ=None):
|
||||||
"""
|
"""
|
||||||
Extract a url-encoded form POST from stream
|
Extract a url-encoded form POST from stream
|
||||||
If not a application/x-www-form-urlencoded, or no missing
|
|
||||||
content length, return None
|
content length, return None
|
||||||
|
Attempt to decode application/x-www-form-urlencoded or multipart/*,
|
||||||
|
otherwise read whole block and b64encode
|
||||||
"""
|
"""
|
||||||
if method.upper() != 'POST':
|
if method.upper() != 'POST':
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if ((not mime or
|
|
||||||
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
length = int(length)
|
length = int(length)
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
|||||||
buffered_stream.write(post_query)
|
buffered_stream.write(post_query)
|
||||||
buffered_stream.seek(0)
|
buffered_stream.seek(0)
|
||||||
|
|
||||||
post_query = to_native_str(post_query)
|
if not mime:
|
||||||
post_query = unquote_plus(post_query)
|
mime = ''
|
||||||
|
|
||||||
|
if mime.startswith('application/x-www-form-urlencoded'):
|
||||||
|
post_query = to_native_str(post_query)
|
||||||
|
post_query = unquote_plus(post_query)
|
||||||
|
|
||||||
|
elif mime.startswith('multipart/'):
|
||||||
|
env = {'REQUEST_METHOD': 'POST',
|
||||||
|
'CONTENT_TYPE': mime,
|
||||||
|
'CONTENT_LENGTH': len(post_query)}
|
||||||
|
|
||||||
|
args = dict(fp=BytesIO(post_query),
|
||||||
|
environ=env,
|
||||||
|
keep_blank_values=True)
|
||||||
|
|
||||||
|
if six.PY3:
|
||||||
|
args['encoding'] = 'utf-8'
|
||||||
|
|
||||||
|
data = cgi.FieldStorage(**args)
|
||||||
|
|
||||||
|
values = []
|
||||||
|
for item in data.list:
|
||||||
|
values.append((item.name, item.value))
|
||||||
|
|
||||||
|
post_query = urlencode(values, True)
|
||||||
|
|
||||||
|
elif mime.startswith('application/x-amf'):
|
||||||
|
post_query = amf_parse(post_query, environ)
|
||||||
|
|
||||||
|
else:
|
||||||
|
post_query = base64.b64encode(post_query)
|
||||||
|
post_query = to_native_str(post_query)
|
||||||
|
post_query = '&__wb_post_data=' + post_query
|
||||||
|
|
||||||
return post_query
|
return post_query
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def amf_parse(string, environ):
|
||||||
|
try:
|
||||||
|
from pyamf import remoting
|
||||||
|
|
||||||
|
res = remoting.decode(BytesIO(string))
|
||||||
|
|
||||||
|
#print(res)
|
||||||
|
body = res.bodies[0][1].body[0]
|
||||||
|
|
||||||
|
values = {}
|
||||||
|
|
||||||
|
if hasattr(body, 'body'):
|
||||||
|
values['body'] = body.body
|
||||||
|
|
||||||
|
if hasattr(body, 'source'):
|
||||||
|
values['source'] = body.source
|
||||||
|
|
||||||
|
if hasattr(body, 'operation'):
|
||||||
|
values['op'] = body.operation
|
||||||
|
|
||||||
|
if environ is not None:
|
||||||
|
environ['pywb.inputdata'] = res
|
||||||
|
|
||||||
|
query = urlencode(values)
|
||||||
|
#print(query)
|
||||||
|
return query
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
print(e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def append_post_query(url, post_query):
|
def append_post_query(url, post_query):
|
||||||
if not post_query:
|
if not post_query:
|
||||||
@ -167,23 +236,34 @@ def read_last_line(fh, offset=256):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class BlockLoader(object):
|
class BaseLoader(object):
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load(self, url, offset=0, length=-1):
|
||||||
|
raise NotImplemented()
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class BlockLoader(BaseLoader):
|
||||||
"""
|
"""
|
||||||
a loader which can stream blocks of content
|
a loader which can stream blocks of content
|
||||||
given a uri, offset and optional length.
|
given a uri, offset and optional length.
|
||||||
Currently supports: http/https and file/local file system
|
Currently supports: http/https and file/local file system
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
loaders = {}
|
||||||
|
profile_loader = None
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
self.cached = {}
|
self.cached = {}
|
||||||
self.args = args
|
|
||||||
self.kwargs = kwargs
|
self.kwargs = kwargs
|
||||||
|
|
||||||
def load(self, url, offset=0, length=-1):
|
def load(self, url, offset=0, length=-1):
|
||||||
loader = self._get_loader_for(url)
|
loader, url = self._get_loader_for_url(url)
|
||||||
return loader.load(url, offset, length)
|
return loader.load(url, offset, length)
|
||||||
|
|
||||||
def _get_loader_for(self, url):
|
def _get_loader_for_url(self, url):
|
||||||
"""
|
"""
|
||||||
Determine loading method based on uri
|
Determine loading method based on uri
|
||||||
"""
|
"""
|
||||||
@ -193,18 +273,47 @@ class BlockLoader(object):
|
|||||||
else:
|
else:
|
||||||
type_ = parts[0]
|
type_ = parts[0]
|
||||||
|
|
||||||
|
if '+' in type_:
|
||||||
|
profile_name, scheme = type_.split('+', 1)
|
||||||
|
if len(parts) == 2:
|
||||||
|
url = scheme + '://' + parts[1]
|
||||||
|
else:
|
||||||
|
profile_name = ''
|
||||||
|
scheme = type_
|
||||||
|
|
||||||
loader = self.cached.get(type_)
|
loader = self.cached.get(type_)
|
||||||
if loader:
|
if loader:
|
||||||
return loader
|
return loader, url
|
||||||
|
|
||||||
|
loader_cls = self._get_loader_class_for_type(scheme)
|
||||||
|
|
||||||
loader_cls = LOADERS.get(type_)
|
|
||||||
if not loader_cls:
|
if not loader_cls:
|
||||||
raise IOError('No Loader for type: ' + type_)
|
raise IOError('No Loader for type: ' + scheme)
|
||||||
|
|
||||||
|
profile = self.kwargs
|
||||||
|
|
||||||
|
if self.profile_loader:
|
||||||
|
profile = self.profile_loader(profile_name, scheme)
|
||||||
|
|
||||||
|
loader = loader_cls(**profile)
|
||||||
|
|
||||||
loader = loader_cls(*self.args, **self.kwargs)
|
|
||||||
self.cached[type_] = loader
|
self.cached[type_] = loader
|
||||||
return loader
|
return loader, url
|
||||||
|
|
||||||
|
def _get_loader_class_for_type(self, type_):
|
||||||
|
loader_cls = self.loaders.get(type_)
|
||||||
|
return loader_cls
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def init_default_loaders():
|
||||||
|
BlockLoader.loaders['http'] = HttpLoader
|
||||||
|
BlockLoader.loaders['https'] = HttpLoader
|
||||||
|
BlockLoader.loaders['s3'] = S3Loader
|
||||||
|
BlockLoader.loaders['file'] = LocalFileLoader
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def set_profile_loader(src):
|
||||||
|
BlockLoader.profile_loader = src
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_range_header(offset, length):
|
def _make_range_header(offset, length):
|
||||||
@ -217,10 +326,7 @@ class BlockLoader(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class LocalFileLoader(object):
|
class LocalFileLoader(BaseLoader):
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def load(self, url, offset=0, length=-1):
|
def load(self, url, offset=0, length=-1):
|
||||||
"""
|
"""
|
||||||
Load a file-like reader from the local file system
|
Load a file-like reader from the local file system
|
||||||
@ -260,9 +366,11 @@ class LocalFileLoader(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HttpLoader(object):
|
class HttpLoader(BaseLoader):
|
||||||
def __init__(self, cookie_maker=None, *args, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
self.cookie_maker = cookie_maker
|
self.cookie_maker = kwargs.get('cookie_maker')
|
||||||
|
if not self.cookie_maker:
|
||||||
|
self.cookie_maker = kwargs.get('cookie')
|
||||||
self.session = None
|
self.session = None
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
@ -288,33 +396,47 @@ class HttpLoader(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class S3Loader(object):
|
class S3Loader(BaseLoader):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
self.s3conn = None
|
self.s3conn = None
|
||||||
|
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
||||||
|
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
if not s3_avail: #pragma: no cover
|
if not s3_avail: #pragma: no cover
|
||||||
raise IOError('To load from s3 paths, ' +
|
raise IOError('To load from s3 paths, ' +
|
||||||
'you must install boto: pip install boto')
|
'you must install boto: pip install boto')
|
||||||
|
|
||||||
if not self.s3conn:
|
aws_access_key_id = self.aws_access_key_id
|
||||||
try:
|
aws_secret_access_key = self.aws_secret_access_key
|
||||||
self.s3conn = connect_s3()
|
|
||||||
except Exception: #pragma: no cover
|
|
||||||
self.s3conn = connect_s3(anon=True)
|
|
||||||
|
|
||||||
parts = urlsplit(url)
|
parts = urlsplit(url)
|
||||||
|
|
||||||
bucket = self.s3conn.get_bucket(parts.netloc)
|
if parts.username and parts.password:
|
||||||
|
aws_access_key_id = unquote_plus(parts.username)
|
||||||
|
aws_secret_access_key = unquote_plus(parts.password)
|
||||||
|
bucket_name = parts.netloc.split('@', 1)[-1]
|
||||||
|
else:
|
||||||
|
bucket_name = parts.netloc
|
||||||
|
|
||||||
headers = {'Range': BlockLoader._make_range_header(offset, length)}
|
if not self.s3conn:
|
||||||
|
try:
|
||||||
|
self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key)
|
||||||
|
except Exception: #pragma: no cover
|
||||||
|
self.s3conn = connect_s3(anon=True)
|
||||||
|
|
||||||
|
bucket = self.s3conn.get_bucket(bucket_name)
|
||||||
|
|
||||||
key = bucket.get_key(parts.path)
|
key = bucket.get_key(parts.path)
|
||||||
|
|
||||||
result = key.get_contents_as_string(headers=headers)
|
if offset == 0 and length == -1:
|
||||||
key.close()
|
headers = {}
|
||||||
|
else:
|
||||||
|
headers = {'Range': BlockLoader._make_range_header(offset, length)}
|
||||||
|
|
||||||
return BytesIO(result)
|
# Read range
|
||||||
|
key.open_read(headers=headers)
|
||||||
|
return key
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -414,12 +536,6 @@ class LimitReader(object):
|
|||||||
|
|
||||||
return stream
|
return stream
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
#=================================================================
|
BlockLoader.init_default_loaders()
|
||||||
LOADERS = {'http': HttpLoader,
|
|
||||||
'https': HttpLoader,
|
|
||||||
's3': S3Loader,
|
|
||||||
'file': LocalFileLoader
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -133,6 +133,14 @@ def compress_alt(buff):
|
|||||||
|
|
||||||
return compressed
|
return compressed
|
||||||
|
|
||||||
|
# Brotli
|
||||||
|
|
||||||
|
def test_brotli():
|
||||||
|
with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
|
||||||
|
x = DecompressingBufferedReader(fh, decomp_type='br')
|
||||||
|
x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Errors
|
# Errors
|
||||||
|
|
||||||
@ -140,9 +148,11 @@ def test_err_compress_mix():
|
|||||||
# error: compressed member, followed by not compressed -- considered invalid
|
# error: compressed member, followed by not compressed -- considered invalid
|
||||||
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
|
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
|
||||||
b = x.read()
|
b = x.read()
|
||||||
b = x.read_next_member()
|
assert b == b'ABC'
|
||||||
with pytest.raises(zlib.error):
|
x.read_next_member()
|
||||||
x.read()
|
assert x.read() == b''
|
||||||
|
#with pytest.raises(zlib.error):
|
||||||
|
# x.read()
|
||||||
#error: Error -3 while decompressing: incorrect header check
|
#error: Error -3 while decompressing: incorrect header check
|
||||||
|
|
||||||
def test_err_chunk_cut_off():
|
def test_err_chunk_cut_off():
|
||||||
|
@ -37,17 +37,21 @@ Traceback (most recent call last):
|
|||||||
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
||||||
|
|
||||||
# HMAC Cookie Maker
|
# HMAC Cookie Maker
|
||||||
>>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
|
>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
|
||||||
'Example Domain'
|
'Example Domain'
|
||||||
|
|
||||||
# fixed cookie, range request
|
# fixed cookie, range request
|
||||||
>>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read())
|
>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read())
|
||||||
'Example Domain'
|
'Example Domain'
|
||||||
|
|
||||||
# range request
|
# range request
|
||||||
>>> print_str(BlockLoader().load('http://example.com', 1262).read())
|
>>> print_str(BlockLoader().load('http://example.com', 1262).read())
|
||||||
'</html>\n'
|
'</html>\n'
|
||||||
|
|
||||||
|
# custom profile
|
||||||
|
>>> print_str(BlockLoader().load('local+http://example.com', 1262).read())
|
||||||
|
'</html>\n'
|
||||||
|
|
||||||
# unknown loader error
|
# unknown loader error
|
||||||
#>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL
|
#>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL
|
||||||
#Traceback (most recent call last):
|
#Traceback (most recent call last):
|
||||||
@ -90,8 +94,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
|
|||||||
# unsupported method
|
# unsupported method
|
||||||
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
|
||||||
|
|
||||||
# unsupported type
|
# base64 encode
|
||||||
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
|
||||||
|
'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
|
||||||
|
|
||||||
# invalid length
|
# invalid length
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
|
||||||
|
@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
self['mime'] = def_mime
|
self['mime'] = def_mime
|
||||||
if mime:
|
if mime:
|
||||||
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
||||||
|
self['_content_type'] = mime
|
||||||
|
|
||||||
def extract_status(self, status_headers):
|
def extract_status(self, status_headers):
|
||||||
""" Extract status code only from status line
|
""" Extract status code only from status line
|
||||||
@ -390,7 +391,7 @@ class DefaultRecordParser(object):
|
|||||||
len_ = record.status_headers.get_header('Content-Length')
|
len_ = record.status_headers.get_header('Content-Length')
|
||||||
|
|
||||||
post_query = extract_post_query(method,
|
post_query = extract_post_query(method,
|
||||||
entry.get('mime'),
|
entry.get('_content_type'),
|
||||||
len_,
|
len_,
|
||||||
record.stream)
|
record.stream)
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object):
|
|||||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||||
verify_http=True):
|
verify_http=True):
|
||||||
if not loader:
|
if not loader:
|
||||||
loader = BlockLoader(cookie_maker)
|
loader = BlockLoader(cookie_maker=cookie_maker)
|
||||||
|
|
||||||
self.loader = loader
|
self.loader = loader
|
||||||
self.block_size = block_size
|
self.block_size = block_size
|
||||||
|
@ -174,7 +174,8 @@ class ReplayView(object):
|
|||||||
stream=stream,
|
stream=stream,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
urlkey=cdx['urlkey'],
|
urlkey=cdx['urlkey'],
|
||||||
cdx=cdx))
|
cdx=cdx,
|
||||||
|
env=wbrequest.env))
|
||||||
|
|
||||||
(status_headers, response_iter, is_rewritten) = result
|
(status_headers, response_iter, is_rewritten) = result
|
||||||
|
|
||||||
|
2
sample_archive/text_content/quickfox_repeated.compressed
Normal file
2
sample_archive/text_content/quickfox_repeated.compressed
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
[яЇА"y\ыZЊB;ф%UZ’™±5Ићћ
|
||||||
|
{Kђ№<<3C>И @ужЩMдme'‡_¦й0–{<ШS
|
6
setup.py
6
setup.py
@ -78,7 +78,8 @@ setup(
|
|||||||
'requests',
|
'requests',
|
||||||
'redis',
|
'redis',
|
||||||
'jinja2',
|
'jinja2',
|
||||||
'surt==0.3b4',
|
'surt>=0.3.0',
|
||||||
|
'brotlipy',
|
||||||
'pyyaml',
|
'pyyaml',
|
||||||
'watchdog',
|
'watchdog',
|
||||||
'webencodings',
|
'webencodings',
|
||||||
@ -90,9 +91,6 @@ setup(
|
|||||||
'fakeredis',
|
'fakeredis',
|
||||||
'mock',
|
'mock',
|
||||||
],
|
],
|
||||||
dependency_links=[
|
|
||||||
'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops',
|
|
||||||
],
|
|
||||||
cmdclass={'test': PyTest},
|
cmdclass={'test': PyTest},
|
||||||
test_suite='',
|
test_suite='',
|
||||||
entry_points="""
|
entry_points="""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user