From 1e7d4d27e3d5aeb7f06dd8a902680bcc59cb7620 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 6 May 2016 09:43:11 -0700 Subject: [PATCH 01/22] bump version to 0.30.2 --- README.rst | 2 +- pywb/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index fc257400..0f5a85df 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.30.1 +PyWb 0.30.2 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index c3b4b701..2ac4d5e1 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.30.1' +__version__ = '0.30.2' DEFAULT_CONFIG = 'pywb/default_config.yaml' From e5e7c5a7df0b3e60a7164fc326083c745c80bc5e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 6 May 2016 09:48:38 -0700 Subject: [PATCH 02/22] wombat: ensure Math.random() overrides use the current window --- pywb/static/wombat.js | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 5fb4e2a4..171b1940 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -667,15 +667,15 @@ var wombat_internal = function($wbwindow) { // Adapted from: // http://indiegamr.com/generate-repeatable-random-numbers-in-js/ - Math.seed = parseInt(seed); + $wbwindow.Math.seed = parseInt(seed); function seeded_random() { - Math.seed = (Math.seed * 9301 + 49297) % 233280; - var rnd = Math.seed / 233280; + $wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280; + var rnd = $wbwindow.Math.seed / 233280; return rnd; } - Math.random = seeded_random; + $wbwindow.Math.random = seeded_random; } function init_crypto_random() { @@ -687,7 +687,7 @@ var wombat_internal = function($wbwindow) { var new_getrandom = function(array) { for (i = 0; i < array.length; i++) { - array[i] = parseInt(Math.random() * 4294967296); + array[i] = parseInt($wbwindow.Math.random() * 4294967296); } 
return array; } @@ -931,7 +931,8 @@ var wombat_internal = function($wbwindow) { //var timezone = new Date().getTimezoneOffset() * 60 * 1000; // Already UTC! var timezone = 0; - var timediff = $wbwindow.Date.now() - (timestamp - timezone); + var start_now = $wbwindow.Date.now() + var timediff = start_now - (timestamp - timezone); if ($wbwindow.__wb_Date_now) { return; @@ -1656,13 +1657,14 @@ var wombat_internal = function($wbwindow) { var from = source.WB_wombat_location.origin; - if (!source.__WB_id) { - source.__WB_id = Math.round(Math.random() * 1000) + source.WB_wombat_location.href; - } if (!this.__WB_win_id) { this.__WB_win_id = {}; + this.__WB_counter = 0; } + if (!source.__WB_id) { + source.__WB_id = (this.__WB_counter++) + source.WB_wombat_location.href; + } this.__WB_win_id[source.__WB_id] = source; src_id = source.__WB_id; From 87da25c703cd424a1398ba1a0e364abe0304a485 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 6 May 2016 10:19:08 -0700 Subject: [PATCH 03/22] post request mapping improvements: work on #178, including: - mapping multipart/form-data same as x-www-form-urlencoded - parsing application/x-amf with pyamf - RewriteContentAMF for rewriting AMF response to match request - default encoding of other POST data as base64 encoded __wb_post_data param --- pywb/framework/wbrequestresponse.py | 5 +- pywb/rewrite/rewrite_content.py | 75 +++++++++++++++++++++++-- pywb/utils/loaders.py | 87 ++++++++++++++++++++++++++--- pywb/utils/test/test_loaders.py | 3 +- pywb/warc/archiveiterator.py | 3 +- pywb/webapp/replay_views.py | 7 ++- setup.py | 3 +- 7 files changed, 160 insertions(+), 23 deletions(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 0d2634f5..36afff40 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -184,14 +184,15 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE', '').split(';')[0] + mime = 
self.env.get('CONTENT_TYPE', '') length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] buffered_stream = BytesIO() post_query = extract_post_query('POST', mime, length, stream, - buffered_stream=buffered_stream) + buffered_stream=buffered_stream, + environ=self.env) if post_query: self.env['wsgi.input'] = buffered_stream diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 4454ea3c..e9ad9fd0 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -5,7 +5,8 @@ import yaml import re from chardet.universaldetector import UniversalDetector -from io import BytesIO +from io import BytesIO, BufferedReader +from six.moves import zip from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders @@ -21,7 +22,7 @@ from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter #================================================================= -class RewriteContent: +class RewriteContent(object): HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I) TAG_REGEX = re.compile(b'^\s*\<') @@ -94,7 +95,7 @@ class RewriteContent: def rewrite_content(self, urlrewriter, status_headers, stream, head_insert_func=None, urlkey='', - cdx=None, cookie_rewriter=None): + cdx=None, cookie_rewriter=None, env=None): wb_url = urlrewriter.wburl @@ -118,9 +119,12 @@ class RewriteContent: status_headers = rewritten_headers.status_headers - # use rewritten headers, but no further rewriting needed - if rewritten_headers.text_type is None: - return (status_headers, self.stream_to_gen(stream), False) + res = self.handle_custom_rewrite(rewritten_headers.text_type, + status_headers, + stream, + env) + if res: + return res # Handle text content rewriting # ==================================================================== @@ -237,6 +241,11 @@ class RewriteContent: return (status_headers, gen, True) + def handle_custom_rewrite(self, text_type, status_headers, stream, env): + # use rewritten headers, but no 
further rewriting needed + if text_type is None: + return (status_headers, self.stream_to_gen(stream), False) + @staticmethod def _extract_html_charset(buff, status_headers): charset = None @@ -360,3 +369,57 @@ class RewriteContent: finally: stream.close() + + +# ================================================================= +class RewriteContentAMF(RewriteContent): + def handle_custom_rewrite(self, text_type, status_headers, stream, env): + + if status_headers.get_header('Content-Type') == 'application/x-amf': + stream = self.rewrite_amf(stream, env) + + return (super(RewriteContentAMF, self). + handle_custom_rewrite(text_type, status_headers, stream, env)) + + def rewrite_amf(self, stream, env): + try: + from pyamf import remoting + + iobuff = BytesIO() + while True: + buff = stream.read() + if not buff: + break + iobuff.write(buff) + + iobuff.seek(0) + res = remoting.decode(iobuff) + + print('rewrite amf') + + print(env.get('pywb.inputdata')) + + if env and env.get('pywb.inputdata'): + inputdata = env.get('pywb.inputdata') + + new_list = [] + + for src, target in zip(inputdata.bodies, res.bodies): + print(target[0] + ' = ' + src[0]) + + print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId) + target[1].body.correlationId = src[1].body[0].messageId + + new_list.append((src[0], target[1])) + + res.bodies = new_list + + return BytesIO(remoting.encode(res).getvalue()) + + except Exception as e: + import traceback + traceback.print_exc() + print(e) + return stream + + diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6dbbf1e2..6a262db4 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,10 +9,12 @@ import requests import six from six.moves.urllib.request import pathname2url, url2pathname -from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit +from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode import time import pkg_resources +import base64 +import cgi 
from io import open, BytesIO @@ -65,19 +67,18 @@ def to_native_str(value, encoding='iso-8859-1', func=lambda x: x): #================================================================= -def extract_post_query(method, mime, length, stream, buffered_stream=None): +def extract_post_query(method, mime, length, stream, + buffered_stream=None, + environ=None): """ Extract a url-encoded form POST from stream - If not a application/x-www-form-urlencoded, or no missing content length, return None + Attempt to decode application/x-www-form-urlencoded or multipart/*, + otherwise read whole block and b64encode """ if method.upper() != 'POST': return None - if ((not mime or - not mime.lower().startswith('application/x-www-form-urlencoded'))): - return None - try: length = int(length) except (ValueError, TypeError): @@ -101,11 +102,79 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None): buffered_stream.write(post_query) buffered_stream.seek(0) - post_query = to_native_str(post_query) - post_query = unquote_plus(post_query) + if not mime: + mime = '' + + if mime.startswith('application/x-www-form-urlencoded'): + post_query = to_native_str(post_query) + post_query = unquote_plus(post_query) + + elif mime.startswith('multipart/'): + env = {'REQUEST_METHOD': 'POST', + 'CONTENT_TYPE': mime, + 'CONTENT_LENGTH': len(post_query)} + + args = dict(fp=BytesIO(post_query), + environ=env, + keep_blank_values=True) + + if six.PY3: + args['encoding'] = 'utf-8' + + data = cgi.FieldStorage(**args) + + values = [] + for item in data.list: + values.append((item.name, item.value)) + + post_query = urlencode(values, True) + + elif mime.startswith('application/x-amf'): + post_query = amf_parse(post_query, environ) + + else: + post_query = base64.b64encode(post_query) + post_query = to_native_str(post_query) + post_query = '&__wb_post_data=' + post_query + return post_query +#================================================================= +def amf_parse(string, environ): + 
try: + from pyamf import remoting + + res = remoting.decode(BytesIO(string)) + + #print(res) + body = res.bodies[0][1].body[0] + + values = {} + + if hasattr(body, 'body'): + values['body'] = body.body + + if hasattr(body, 'source'): + values['source'] = body.source + + if hasattr(body, 'operation'): + values['op'] = body.operation + + if environ is not None: + environ['pywb.inputdata'] = res + + query = urlencode(values) + #print(query) + return query + + except Exception as e: + import traceback + traceback.print_exc() + print(e) + return None + + #================================================================= def append_post_query(url, post_query): if not post_query: diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index abf0acfa..0a751712 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -90,8 +90,9 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_' # unsupported method >>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data)) -# unsupported type +# base64 encode >>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data)) +'&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6' # invalid length >>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data)) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 76a76abd..4ff500d4 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -253,6 +253,7 @@ class ArchiveIndexEntryMixin(object): self['mime'] = def_mime if mime: self['mime'] = self.MIME_RE.split(mime, 1)[0] + self['_content_type'] = mime def extract_status(self, status_headers): """ Extract status code only from status line @@ -390,7 +391,7 @@ class DefaultRecordParser(object): len_ = record.status_headers.get_header('Content-Length') post_query = extract_post_query(method, - entry.get('mime'), + entry.get('_content_type'), len_, 
record.stream) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index d3771c68..6af44ba3 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -13,7 +13,7 @@ from pywb.utils.timeutils import timestamp_now from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse -from pywb.rewrite.rewrite_content import RewriteContent +from pywb.rewrite.rewrite_content import RewriteContentAMF from pywb.warc.recordloader import ArchiveLoadFailed from pywb.webapp.views import HeadInsertView @@ -40,7 +40,7 @@ class ReplayView(object): self.content_loader = content_loader framed = config.get('framed_replay') - self.content_rewriter = RewriteContent(is_framed_replay=framed) + self.content_rewriter = RewriteContentAMF(is_framed_replay=framed) self.head_insert_view = HeadInsertView.init_from_config(config) @@ -174,7 +174,8 @@ class ReplayView(object): stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - cdx=cdx)) + cdx=cdx, + env=wbrequest.env)) (status_headers, response_iter, is_rewritten) = result diff --git a/setup.py b/setup.py index 6dba8420..6936b3f8 100755 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ setup( 'pyyaml', 'watchdog', 'webencodings', + 'pyamf', ], tests_require=[ 'pytest', @@ -91,7 +92,7 @@ setup( 'mock', ], dependency_links=[ - 'git+https://github.com/ikreymer/fakeredis.git@zset-lex-ops#egg=fakeredis-0.6.2-zset-lex-ops', + 'git+https://github.com/t0m/pyamf.git@python3' ], cmdclass={'test': PyTest}, test_suite='', From 10d8e4b3be5f236f3365a6de618f84f9405bbcdb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 May 2016 18:38:57 -0700 Subject: [PATCH 04/22] bump version to 0.31.0 --- README.rst | 2 +- pywb/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 0f5a85df..010a6f3e 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.30.2 +PyWb 0.31.0 =========== .. 
image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index 2ac4d5e1..9f66d658 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.30.2' +__version__ = '0.31.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' From 94afab0bb2d1d245f227068f6d483ed3e6ca49a7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 May 2016 18:53:00 -0700 Subject: [PATCH 05/22] wombat rewrite: don't add duplicate slash in rel-url resolve --- pywb/static/wombat.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 171b1940..3032df0c 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -292,6 +292,7 @@ var wombat_internal = function($wbwindow) { var parser = make_parser(extract_orig($wbwindow.document.baseURI)); var href = parser.href; var hash = href.lastIndexOf("#"); + if (hash >= 0) { href = href.substring(0, hash); } @@ -300,8 +301,6 @@ var wombat_internal = function($wbwindow) { if (lastslash >= 0 && lastslash != (href.length - 1)) { href = href.substring(0, lastslash + 1); - } else { - href += "/"; } parser.href = href + url; From 119074e0ee8d9580af8aa97346f527463695d2ba Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 May 2016 18:55:10 -0700 Subject: [PATCH 06/22] s3 loader improvements: support AWS cred in username and password part of url, stream s3 response directly --- pywb/utils/loaders.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6a262db4..9841929d 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -366,24 +366,36 @@ class S3Loader(object): raise IOError('To load from s3 paths, ' + 'you must install boto: pip install boto') - if not self.s3conn: - try: - self.s3conn = connect_s3() - except Exception: #pragma: no cover - self.s3conn = connect_s3(anon=True) + aws_access_key_id 
= None + aws_secret_access_key = None parts = urlsplit(url) - bucket = self.s3conn.get_bucket(parts.netloc) + if parts.username and parts.password: + aws_access_key_id = parts.username + aws_secret_access_key = parts.password + bucket_name = parts.netloc.split('@', 1)[-1] + else: + bucket_name = parts.netloc - headers = {'Range': BlockLoader._make_range_header(offset, length)} + if not self.s3conn: + try: + self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key) + except Exception: #pragma: no cover + self.s3conn = connect_s3(anon=True) + + bucket = self.s3conn.get_bucket(bucket_name) key = bucket.get_key(parts.path) - result = key.get_contents_as_string(headers=headers) - key.close() + if offset == 0 and length == -1: + headers = {} + else: + headers = {'Range': BlockLoader._make_range_header(offset, length)} - return BytesIO(result) + # Read range + key.open_read(headers=headers) + return key #================================================================= From d11bd444ade0170b778ce1158dee97d1c1c77bfd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 May 2016 19:24:14 -0700 Subject: [PATCH 07/22] s3 loader: unurlencode username/password --- pywb/utils/loaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 9841929d..376a6224 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -372,8 +372,8 @@ class S3Loader(object): parts = urlsplit(url) if parts.username and parts.password: - aws_access_key_id = parts.username - aws_secret_access_key = parts.password + aws_access_key_id = unquote_plus(parts.username) + aws_secret_access_key = unquote_plus(parts.password) bucket_name = parts.netloc.split('@', 1)[-1] else: bucket_name = parts.netloc From 8ad66249c7ea190cf72f62594763f151687dde60 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 May 2016 16:34:58 -0700 Subject: [PATCH 08/22] blockloader: support for loader profiles, specified via 
'profile+scheme://...' urls. Profiles specify additional settings (eg. credentials) that are not included in the url. To enable custom profiles, call BlockLoader.set_profile_loader(callable) with a callable that will return custom config, addresses #180 --- pywb/utils/loaders.py | 89 ++++++++++++++++++++++--------- pywb/utils/test/test_loaders.py | 4 +- pywb/warc/recordloader.py | 2 +- 3 files changed, 64 insertions(+), 31 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 376a6224..6ce7bbac 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -236,23 +236,34 @@ def read_last_line(fh, offset=256): #================================================================= -class BlockLoader(object): +class BaseLoader(object): + def __init__(self, **kwargs): + pass + + def load(self, url, offset=0, length=-1): + raise NotImplemented() + + +#================================================================= +class BlockLoader(BaseLoader): """ a loader which can stream blocks of content given a uri, offset and optional length. 
Currently supports: http/https and file/local file system """ - def __init__(self, *args, **kwargs): + loaders = {} + profile_loader = None + + def __init__(self, **kwargs): self.cached = {} - self.args = args self.kwargs = kwargs def load(self, url, offset=0, length=-1): - loader = self._get_loader_for(url) + loader = self._get_loader_for_url(url) return loader.load(url, offset, length) - def _get_loader_for(self, url): + def _get_loader_for_url(self, url): """ Determine loading method based on uri """ @@ -266,14 +277,41 @@ class BlockLoader(object): if loader: return loader - loader_cls = LOADERS.get(type_) - if not loader_cls: - raise IOError('No Loader for type: ' + type_) + if '+' in type_: + profile_name, scheme = type_.split('+', 1) + else: + profile_name = '' + scheme = type_ + + loader_cls = self._get_loader_class_for_type(scheme) + + if not loader_cls: + raise IOError('No Loader for type: ' + scheme) + + profile = self.kwargs + + if self.profile_loader: + profile = self.profile_loader(profile_name, scheme) + + loader = loader_cls(**profile) - loader = loader_cls(*self.args, **self.kwargs) self.cached[type_] = loader return loader + def _get_loader_class_for_type(self, type_): + loader_cls = self.loaders.get(type_) + return loader_cls + + @staticmethod + def init_default_loaders(): + BlockLoader.loaders['http'] = HttpLoader + BlockLoader.loaders['https'] = HttpLoader + BlockLoader.loaders['s3'] = S3Loader + BlockLoader.loaders['file'] = LocalFileLoader + + @staticmethod + def set_profile_loader(src): + BlockLoader.profile_loader = src @staticmethod def _make_range_header(offset, length): @@ -286,10 +324,7 @@ class BlockLoader(object): #================================================================= -class LocalFileLoader(object): - def __init__(self, *args, **kwargs): - pass - +class LocalFileLoader(BaseLoader): def load(self, url, offset=0, length=-1): """ Load a file-like reader from the local file system @@ -329,9 +364,11 @@ class 
LocalFileLoader(object): #================================================================= -class HttpLoader(object): - def __init__(self, cookie_maker=None, *args, **kwargs): - self.cookie_maker = cookie_maker +class HttpLoader(BaseLoader): + def __init__(self, **kwargs): + self.cookie_maker = kwargs.get('cookie_maker') + if not self.cookie_maker: + self.cookie_maker = kwargs.get('cookie') self.session = None def load(self, url, offset, length): @@ -357,17 +394,19 @@ class HttpLoader(object): #================================================================= -class S3Loader(object): - def __init__(self, *args, **kwargs): +class S3Loader(BaseLoader): + def __init__(self, **kwargs): self.s3conn = None + self.aws_access_key_id = kwargs.get('aws_access_key_id') + self.aws_secret_access_key = kwargs.get('aws_secret_access_key') def load(self, url, offset, length): if not s3_avail: #pragma: no cover raise IOError('To load from s3 paths, ' + 'you must install boto: pip install boto') - aws_access_key_id = None - aws_secret_access_key = None + aws_access_key_id = self.aws_access_key_id + aws_secret_access_key = self.aws_secret_access_key parts = urlsplit(url) @@ -495,12 +534,6 @@ class LimitReader(object): return stream - -#================================================================= -LOADERS = {'http': HttpLoader, - 'https': HttpLoader, - 's3': S3Loader, - 'file': LocalFileLoader - } - +# ============================================================================ +BlockLoader.init_default_loaders() diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 0a751712..8e8595bd 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -37,11 +37,11 @@ Traceback (most recent call last): IOError: [Errno 2] No such file or directory: '_x_no_such_file_' # HMAC Cookie Maker ->>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) +>>> 
print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) 'Example Domain' # fixed cookie, range request ->>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read()) +>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read()) 'Example Domain' # range request diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 06a3c79e..402d1524 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object): def __init__(self, loader=None, cookie_maker=None, block_size=8192, verify_http=True): if not loader: - loader = BlockLoader(cookie_maker) + loader = BlockLoader(cookie_maker=cookie_maker) self.loader = loader self.block_size = block_size From 8ef6eb97b89e5dc416413ef516cda05dca15c75d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 May 2016 11:47:44 -0700 Subject: [PATCH 09/22] cdx: encoding: use to_native_str() consistently for better py2 compat --- pywb/cdx/cdxobject.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 432d69e4..3e8dddc5 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -153,7 +153,7 @@ class CDXObject(OrderedDict): raise CDXException(msg) for header, field in zip(cdxformat, fields): - self[header] = field.decode('utf-8') + self[header] = to_native_str(field, 'utf-8') self.cdxline = cdxline @@ -213,7 +213,7 @@ class CDXObject(OrderedDict): def __str__(self): if self.cdxline: - return self.cdxline.decode('utf-8') + return to_native_str(self.cdxline, 'utf-8') if not self._from_json: return ' '.join(str(val) for val in six.itervalues(self)) @@ -263,7 +263,7 @@ class IDXObject(OrderedDict): raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) for header, field in zip(self.FORMAT, fields): - self[header] = field.decode('utf-8') + self[header] = to_native_str(field, 
'utf-8') self['offset'] = int(self['offset']) self['length'] = int(self['length']) @@ -285,4 +285,4 @@ class IDXObject(OrderedDict): return json_encode(self) + '\n' def __str__(self): - return self.idxline.decode('utf-8') + return to_native_str(self.idxline, 'utf-8') From 84c829467b71c744fe24c094b9ea81b9a9ffdc47 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 May 2016 12:10:10 -0700 Subject: [PATCH 10/22] framed replay: use postMessage() instead of custom function to notify of replay frame changing url, include different type of change, eg. load, replaceState, pushState, #181 --- pywb/static/wb.js | 23 +++++++++++++++------ pywb/static/wb_frame.js | 45 ++++++++++++++++++++++++----------------- pywb/static/wombat.js | 22 +++++++++++++++----- 3 files changed, 60 insertions(+), 30 deletions(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index e186f1df..f57e833b 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -123,12 +123,23 @@ function notify_top() { return; } - if (window.__WB_top_frame.update_wb_url) { - window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, - wbinfo.timestamp, - wbinfo.request_ts, - wbinfo.is_live); - } + //if (window.__WB_top_frame.update_wb_url) { + // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, + // wbinfo.timestamp, + // wbinfo.request_ts, + // wbinfo.is_live); + //} + + var message = { + "url": window.WB_wombat_location.href, + "ts": wbinfo.timestamp, + "request_ts": wbinfo.request_ts, + "is_live": wbinfo.is_live, + "title": "", + "wb_type": "load", + } + + window.__WB_top_frame.postMessage(message, "*"); remove_event("readystatechange", notify_top, document); } diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index c9e47ef3..168b914f 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -38,27 +38,21 @@ function make_url(url, ts, mod) } } -function push_state(url, timestamp, request_ts, capture_str, is_live) { +function push_state(state) { var frame 
= document.getElementById(IFRAME_ID).contentWindow; if (frame.WB_wombat_location) { var curr_href = frame.WB_wombat_location.href; // If not current url, don't update - if (url != curr_href) { + if (state.url != curr_href) { return; } } - var state = {} - state.timestamp = timestamp; - state.request_ts = request_ts; - state.outer_url = make_url(url, state.request_ts, wbinfo.frame_mod); - state.inner_url = make_url(url, state.request_ts, wbinfo.replay_mod); - state.url = url; - state.capture_str = capture_str; - state.is_live = is_live; + state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod); + state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod); - var canon_url = make_url(url, state.request_ts, ""); + var canon_url = make_url(state.url, state.request_ts, ""); if (window.location.href != canon_url) { window.history.replaceState(state, "", canon_url); } @@ -157,7 +151,13 @@ function iframe_loaded(event) { request_ts = ts; } - update_wb_url(url, ts, request_ts, is_live); + var state = {} + state["url"] = url; + state["ts"] = ts; + state["request_ts"] = request_ts; + state["is_live"] = is_live + + update_wb_url(state); } @@ -165,12 +165,18 @@ function init_pm() { var frame = document.getElementById(IFRAME_ID).contentWindow; window.addEventListener("message", function(event) { - // Pass to replay frame if (event.source == window.parent) { + // Pass to replay frame frame.postMessage(event.data, "*"); } else if (event.source == frame) { - // Pass to parent - window.parent.postMessage(event.data, "*"); + + // Check if iframe url change message + if (typeof(event.data) == "object" && event.data["wb_type"]) { + update_wb_url(event.data); + } else { + // Pass to parent + window.parent.postMessage(event.data, "*"); + } } }); @@ -181,14 +187,14 @@ function init_pm() { } -function update_wb_url(url, ts, request_ts, is_live) { - if (curr_state.url == url && curr_state.timestamp == ts) { +function update_wb_url(state) { + if 
(curr_state.url == state.url && curr_state.ts == state.ts) { return; } - capture_str = _wb_js.ts_to_date(ts, true); + state['capture_str'] = _wb_js.ts_to_date(state.ts, true); - push_state(url, ts, request_ts, capture_str, is_live); + push_state(state); } // Load Banner @@ -237,3 +243,4 @@ function init_hash_connect() { } document.addEventListener("DOMContentLoaded", init_hash_connect); + diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 3032df0c..c6d416f3 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -718,11 +718,23 @@ var wombat_internal = function($wbwindow) { orig_func.call(this, state_obj, title, url); - if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { - $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, - wb_info.timestamp, - wb_info.request_ts, - wb_info.is_live); + //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { + // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, + // wb_info.timestamp, + // wb_info.request_ts, + // wb_info.is_live); + //} + if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) { + var message = { + "url": url, + "ts": wb_info.timestamp, + "request_ts": wb_info.request_ts, + "is_live": wb_info.is_live, + "title": title, + "wb_type": func_name, + } + + $wbwindow.__WB_top_frame.postMessage(message, "*"); } } From e28f29430219b4a26374bf2a18d0128de91607cd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 24 May 2016 17:55:17 -0700 Subject: [PATCH 11/22] wombat: ensure window.open() rewrite happens even in if open not in prototype rewrite mod: allow empty "" as set mod, check for undefined --- pywb/static/wombat.js | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index c6d416f3..af280f94 100644 --- a/pywb/static/wombat.js +++ 
b/pywb/static/wombat.js @@ -1796,19 +1796,22 @@ var wombat_internal = function($wbwindow) { //============================================ function init_open_override() { - if (!$wbwindow.Window.prototype.open) { - return; + var orig = $wbwindow.open; + + if ($wbwindow.Window.prototype.open) { + orig = $wbwindow.Window.prototype.open; } - var orig = $wbwindow.Window.prototype.open; - var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { - strUrl = rewrite_url(strUrl); + strUrl = rewrite_url(strUrl, false, ""); return orig.call(this, strUrl, strWindowName, strWindowFeatures); } $wbwindow.open = open_rewritten; - $wbwindow.Window.prototype.open = open_rewritten; + + if ($wbwindow.Window.prototype.open) { + $wbwindow.Window.prototype.open = open_rewritten; + } for (var i = 0; i < $wbwindow.frames.length; i++) { try { @@ -2099,7 +2102,7 @@ var wombat_internal = function($wbwindow) { //============================================ function get_final_url(prefix, mod, url) { - if (!mod) { + if (mod == undefined) { mod = wb_info.mod; } From b54347f8d1ec97ef9a53d96255676a82ec685ec7 Mon Sep 17 00:00:00 2001 From: chdorner Date: Wed, 1 Jun 2016 11:21:55 +0200 Subject: [PATCH 12/22] Allow rewriting of empty srcset attributes Strictly speaking a `srcset` attribute must consist of one or more strings (http://w3c.github.io/html/semantics-embedded-content.html#element-attrdef-img-srcset). However, there are websites out there that specify an empty string as the value. This commit makes sure that the rewriting does not break and just returns an empty string. 
--- pywb/rewrite/html_rewriter.py | 1 + pywb/rewrite/test/test_html_rewriter.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index e57f8591..171b4e69 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -211,6 +211,7 @@ class HTMLRewriterMixin(object): def _rewrite_srcset(self, value, mod=''): values = value.split(',') values = map(lambda x: self._rewrite_url(x.strip()), values) + values = [v for v in values if v is not None] return ', '.join(values) def _rewrite_css(self, css_content): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 818bd114..13063936 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -115,6 +115,10 @@ r""" >>> parse('') +# empty srcset attrib +>>> parse('') + + # Script tag >>> parse('') From 197ed5be98a9279618746b272ccd4829e3c14a5f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 4 Jun 2016 14:09:18 -0400 Subject: [PATCH 13/22] loader: profile urls: ensure the profile prefix is removed from url before passing to loader, #180 --- pywb/utils/loaders.py | 14 ++++++++------ pywb/utils/test/test_loaders.py | 4 ++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6ce7bbac..4c298334 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -260,7 +260,7 @@ class BlockLoader(BaseLoader): self.kwargs = kwargs def load(self, url, offset=0, length=-1): - loader = self._get_loader_for_url(url) + loader, url = self._get_loader_for_url(url) return loader.load(url, offset, length) def _get_loader_for_url(self, url): @@ -273,16 +273,18 @@ class BlockLoader(BaseLoader): else: type_ = parts[0] - loader = self.cached.get(type_) - if loader: - return loader - if '+' in type_: profile_name, scheme = type_.split('+', 1) + if len(parts) == 2: + url = scheme + '://' + parts[1] else: 
profile_name = '' scheme = type_ + loader = self.cached.get(type_) + if loader: + return loader, url + loader_cls = self._get_loader_class_for_type(scheme) if not loader_cls: @@ -296,7 +298,7 @@ class BlockLoader(BaseLoader): loader = loader_cls(**profile) self.cached[type_] = loader - return loader + return loader, url def _get_loader_class_for_type(self, type_): loader_cls = self.loaders.get(type_) diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 8e8595bd..5d71a711 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -48,6 +48,10 @@ IOError: [Errno 2] No such file or directory: '_x_no_such_file_' >>> print_str(BlockLoader().load('http://example.com', 1262).read()) '\n' +# custom profile +>>> print_str(BlockLoader().load('local+http://example.com', 1262).read()) +'\n' + # unknown loader error #>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL #Traceback (most recent call last): From 527a3bc89c557f6db8ef813538a0ee93b50159b7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 00:37:14 -0400 Subject: [PATCH 14/22] bufferedreader: be lenient of partially decompressed data: return what was decompressed, rather than just throw exception esp. 
useful if record was decompressed, but an error in crc check may add additional options for toggling 'leniency' if needed --- pywb/utils/bufferedreaders.py | 5 +++-- pywb/utils/test/test_bufferedreaders.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index f3268c58..81132bdd 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -98,7 +98,7 @@ class BufferedReader(object): if self.decompressor and data: try: data = self.decompressor.decompress(data) - except Exception: + except Exception as e: # if first read attempt, assume non-gzipped stream if self.num_read == 0: if self.decomp_type == 'deflate': @@ -108,7 +108,8 @@ class BufferedReader(object): self.decompressor = None # otherwise (partly decompressed), something is wrong else: - raise + print(str(e)) + return b'' return data def read(self, length=None): diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 9f4fd54a..38223ee9 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -140,9 +140,11 @@ def test_err_compress_mix(): # error: compressed member, followed by not compressed -- considered invalid x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip') b = x.read() - b = x.read_next_member() - with pytest.raises(zlib.error): - x.read() + assert b == b'ABC' + x.read_next_member() + assert x.read() == b'' + #with pytest.raises(zlib.error): + # x.read() #error: Error -3 while decompressing: incorrect header check def test_err_chunk_cut_off(): From 9f299eb8e973c573b88fbd4fdc8ae27b155309ef Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 00:40:35 -0400 Subject: [PATCH 15/22] amf rewriting: move to separate file, mark as experimental, and don't include as default (for now) --- pywb/rewrite/rewrite_amf.py | 52 ++++++++++++++++++++++++++++++ 
pywb/rewrite/rewrite_content.py | 57 ++------------------------------- pywb/webapp/replay_views.py | 4 +-- 3 files changed, 56 insertions(+), 57 deletions(-) create mode 100644 pywb/rewrite/rewrite_amf.py diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py new file mode 100644 index 00000000..07a73470 --- /dev/null +++ b/pywb/rewrite/rewrite_amf.py @@ -0,0 +1,52 @@ +from io import BytesIO +from six.moves import zip +from pywb.rewrite.rewrite_content import RewriteContent + + +# ============================================================================ +# Expiermental: not fully tested +class RewriteContentAMF(RewriteContent): #pragma: no cover + def handle_custom_rewrite(self, text_type, status_headers, stream, env): + + if status_headers.get_header('Content-Type') == 'application/x-amf': + stream = self.rewrite_amf(stream, env) + + return (super(RewriteContentAMF, self). + handle_custom_rewrite(text_type, status_headers, stream, env)) + + def rewrite_amf(self, stream, env): + try: + from pyamf import remoting + + iobuff = BytesIO() + while True: + buff = stream.read() + if not buff: + break + iobuff.write(buff) + + iobuff.seek(0) + res = remoting.decode(iobuff) + + if env and env.get('pywb.inputdata'): + inputdata = env.get('pywb.inputdata') + + new_list = [] + + for src, target in zip(inputdata.bodies, res.bodies): + #print(target[0] + ' = ' + src[0]) + + #print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId) + target[1].body.correlationId = src[1].body[0].messageId + + new_list.append((src[0], target[1])) + + res.bodies = new_list + + return BytesIO(remoting.encode(res).getvalue()) + + except Exception as e: + import traceback + traceback.print_exc() + print(e) + return stream diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index e9ad9fd0..93012d0f 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -4,9 +4,8 @@ import webencodings 
import yaml import re -from chardet.universaldetector import UniversalDetector -from io import BytesIO, BufferedReader -from six.moves import zip +#from chardet.universaldetector import UniversalDetector +from io import BytesIO from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders @@ -371,55 +370,3 @@ class RewriteContent(object): stream.close() -# ================================================================= -class RewriteContentAMF(RewriteContent): - def handle_custom_rewrite(self, text_type, status_headers, stream, env): - - if status_headers.get_header('Content-Type') == 'application/x-amf': - stream = self.rewrite_amf(stream, env) - - return (super(RewriteContentAMF, self). - handle_custom_rewrite(text_type, status_headers, stream, env)) - - def rewrite_amf(self, stream, env): - try: - from pyamf import remoting - - iobuff = BytesIO() - while True: - buff = stream.read() - if not buff: - break - iobuff.write(buff) - - iobuff.seek(0) - res = remoting.decode(iobuff) - - print('rewrite amf') - - print(env.get('pywb.inputdata')) - - if env and env.get('pywb.inputdata'): - inputdata = env.get('pywb.inputdata') - - new_list = [] - - for src, target in zip(inputdata.bodies, res.bodies): - print(target[0] + ' = ' + src[0]) - - print('messageId => corrId ' + target[1].body.correlationId + ' => ' + src[1].body[0].messageId) - target[1].body.correlationId = src[1].body[0].messageId - - new_list.append((src[0], target[1])) - - res.bodies = new_list - - return BytesIO(remoting.encode(res).getvalue()) - - except Exception as e: - import traceback - traceback.print_exc() - print(e) - return stream - - diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 6af44ba3..7d95db1c 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -13,7 +13,7 @@ from pywb.utils.timeutils import timestamp_now from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse -from 
pywb.rewrite.rewrite_content import RewriteContentAMF +from pywb.rewrite.rewrite_content import RewriteContent from pywb.warc.recordloader import ArchiveLoadFailed from pywb.webapp.views import HeadInsertView @@ -40,7 +40,7 @@ class ReplayView(object): self.content_loader = content_loader framed = config.get('framed_replay') - self.content_rewriter = RewriteContentAMF(is_framed_replay=framed) + self.content_rewriter = RewriteContent(is_framed_replay=framed) self.head_insert_view = HeadInsertView.init_from_config(config) From 0f530a3e0e6cb7115c1a09fa7908e754981e23fd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 00:44:52 -0400 Subject: [PATCH 16/22] dependencies: remove pyamf, update to latest surt (0.3.0) --- setup.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 6936b3f8..d7958308 100755 --- a/setup.py +++ b/setup.py @@ -78,11 +78,10 @@ setup( 'requests', 'redis', 'jinja2', - 'surt==0.3b4', + 'surt>=0.3.0', 'pyyaml', 'watchdog', 'webencodings', - 'pyamf', ], tests_require=[ 'pytest', @@ -91,9 +90,6 @@ setup( 'fakeredis', 'mock', ], - dependency_links=[ - 'git+https://github.com/t0m/pyamf.git@python3' - ], cmdclass={'test': PyTest}, test_suite='', entry_points=""" From d2c37f7d91d26f2358e548797ecd42cbbe61ef3e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 01:38:03 -0400 Subject: [PATCH 17/22] html parser: attr_value can now be None -- default to '' for string ops, write attr w/o assignment --- pywb/rewrite/html_rewriter.py | 26 ++++++++++++++++++------- pywb/rewrite/test/test_html_rewriter.py | 8 ++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index e57f8591..9543b159 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -252,6 +252,11 @@ class HTMLRewriterMixin(object): self.out.write('<' + tag) for attr_name, attr_value in tag_attrs: + empty_attr = False + if 
attr_value is None: + attr_value = '' + empty_attr = True + # special case: inline JS/event handler if ((attr_value and attr_value.startswith('javascript:')) or attr_name.startswith('on')): @@ -324,7 +329,7 @@ class HTMLRewriterMixin(object): attr_value = self._rewrite_url(attr_value, rw_mod) # write the attr! - self._write_attr(attr_name, attr_value) + self._write_attr(attr_name, attr_value, empty_attr) return True @@ -347,11 +352,17 @@ class HTMLRewriterMixin(object): return True - def _write_attr(self, name, value): - # parser doesn't differentiate between 'attr=""' and just 'attr' - # 'attr=""' is more common, so use that form - if value: + def _write_attr(self, name, value, empty_attr): + # if empty_attr is set, just write 'attr'! + if empty_attr: + self.out.write(' ' + name) + + # write with value, if set + elif value: + self.out.write(' ' + name + '="' + value.replace('"', '"') + '"') + + # otherwise, 'attr=""' is more common, so use that form else: self.out.write(' ' + name + '=""') @@ -421,8 +432,9 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def feed(self, string): try: HTMLParser.feed(self, string) - except Exception: # pragma: no cover - # only raised in 2.6 + except Exception as e: # pragma: no cover + import traceback + traceback.print_exc() self.out.write(string) def _internal_close(self): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 818bd114..781af0e2 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -66,6 +66,10 @@ r""" >>> parse('X') X +# Empty values should be ignored +>>> parse('') + + # SKIPPED # Unicode -- default with %-encoding #>>> parse(u'испытание') @@ -92,7 +96,7 @@ r""" >>> parse('') - + >>> parse('') @@ -131,7 +135,7 @@ r""" >>> parse('
') -
+
>>> parse('') From 1bfec37970f7806aa81715c16ec92bf3cd679ed0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 01:50:55 -0400 Subject: [PATCH 18/22] html rewriter: attr rewrite ops check for empty/blank attr value, return empty string --- pywb/rewrite/html_rewriter.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 9543b159..8f615df7 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -120,7 +120,7 @@ class HTMLRewriterMixin(object): def _rewrite_meta_refresh(self, meta_refresh): if not meta_refresh: - return None + return '' m = self.META_REFRESH_REGEX.match(meta_refresh) if not m: @@ -133,6 +133,9 @@ class HTMLRewriterMixin(object): return meta_refresh def _rewrite_base(self, url, mod=''): + if not url: + return '' + url = self._ensure_url_has_path(url) base_url = self._rewrite_url(url, mod) @@ -183,11 +186,11 @@ class HTMLRewriterMixin(object): def _rewrite_url(self, value, mod=None): if not value: - return None + return '' value = value.strip() if not value: - return None + return '' value = self.try_unescape(value) return self.url_rewriter.rewrite(value, mod) @@ -209,6 +212,9 @@ class HTMLRewriterMixin(object): return new_value def _rewrite_srcset(self, value, mod=''): + if not value: + return '' + values = value.split(',') values = map(lambda x: self._rewrite_url(x.strip()), values) return ', '.join(values) @@ -217,13 +223,13 @@ class HTMLRewriterMixin(object): if css_content: return self.css_rewriter.rewrite(css_content) else: - return None + return '' def _rewrite_script(self, script_content): if script_content: return self.js_rewriter.rewrite(script_content) else: - return None + return '' def has_attr(self, tag_attrs, attr): name, value = attr From 3b68ef6540ca55c4f2ac180b3a6a12e3853ef4a1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 12 Jun 2016 01:57:21 -0400 Subject: [PATCH 19/22] html rewriter: 
cleanup rewrite_srcset, add more tests for empty rewrite --- pywb/rewrite/html_rewriter.py | 3 +-- pywb/rewrite/test/test_html_rewriter.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 1ae28b6a..90148c1f 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -216,8 +216,7 @@ class HTMLRewriterMixin(object): return '' values = value.split(',') - values = map(lambda x: self._rewrite_url(x.strip()), values) - values = [v for v in values if v is not None] + values = [self._rewrite_url(v.strip()) for v in values] return ', '.join(values) def _rewrite_css(self, css_content): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 30ca7da7..afb1da93 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -49,6 +49,12 @@ r""" >>> parse('', urlrewriter=no_base_canon_rewriter) +# Empty url +>>> parse('') + + +>>> parse('') + # HTML Entities From 457a1a564c1ef0b6fa8f624fc7efef5208777cff Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 15 Jun 2016 01:37:29 -0400 Subject: [PATCH 20/22] bufferedreader: support brotli decompression rewrite: handle Content-Encoding: br using brotli decompressor setup: add brotlipy as dependency --- pywb/rewrite/rewrite_content.py | 12 +++++++++--- pywb/utils/bufferedreaders.py | 14 +++++++++++++- setup.py | 1 + 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 93012d0f..677e20ae 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -77,6 +77,7 @@ class RewriteContent(object): def _check_encoding(self, rewritten_headers, stream, enc): + matched = False if (rewritten_headers. 
contains_removed_header('content-encoding', enc)): @@ -87,8 +88,9 @@ class RewriteContent(object): stream = DecompressingBufferedReader(stream, decomp_type=enc) rewritten_headers.status_headers.remove_header('content-length') + matched = True - return stream + return matched, stream @@ -139,8 +141,12 @@ class RewriteContent(object): encoding = None first_buff = b'' - stream = self._check_encoding(rewritten_headers, stream, 'gzip') - stream = self._check_encoding(rewritten_headers, stream, 'deflate') + for decomp_type in BufferedReader.get_supported_decompressors(): + matched, stream = self._check_encoding(rewritten_headers, + stream, + decomp_type) + if matched: + break if mod == 'js_': text_type, stream = self._resolve_text_type('js', diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index 81132bdd..e1ebfc90 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -1,5 +1,6 @@ from io import BytesIO import zlib +import brotli #================================================================= @@ -17,6 +18,11 @@ def deflate_decompressor(): def deflate_decompressor_alt(): return zlib.decompressobj(-zlib.MAX_WBITS) +def brotli_decompressor(): + decomp = brotli.Decompressor() + decomp.unused_data = None + return decomp + #================================================================= class BufferedReader(object): @@ -40,7 +46,9 @@ class BufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor, 'deflate': deflate_decompressor, - 'deflate_alt': deflate_decompressor_alt} + 'deflate_alt': deflate_decompressor_alt, + 'br': brotli_decompressor + } def __init__(self, stream, block_size=1024, decomp_type=None, @@ -181,6 +189,10 @@ class BufferedReader(object): self.stream.close() self.stream = None + @classmethod + def get_supported_decompressors(cls): + return cls.DECOMPRESSORS.keys() + #================================================================= class DecompressingBufferedReader(BufferedReader): diff --git 
a/setup.py b/setup.py index d7958308..629ea228 100755 --- a/setup.py +++ b/setup.py @@ -79,6 +79,7 @@ setup( 'redis', 'jinja2', 'surt>=0.3.0', + 'brotlipy', 'pyyaml', 'watchdog', 'webencodings', From d45722355504c6be1b8f49e4603e4b629a706303 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 16 Jun 2016 00:00:47 -0400 Subject: [PATCH 21/22] tests: add brotli compression test #184 --- pywb/utils/test/test_bufferedreaders.py | 8 ++++++++ sample_archive/text_content/quickfox_repeated.compressed | 2 ++ 2 files changed, 10 insertions(+) create mode 100644 sample_archive/text_content/quickfox_repeated.compressed diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 38223ee9..7d058dcd 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -133,6 +133,14 @@ def compress_alt(buff): return compressed +# Brotli + +def test_brotli(): + with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh: + x = DecompressingBufferedReader(fh, decomp_type='br') + x.read() == b'The quick brown fox jumps over the lazy dog' * 4096 + + # Errors diff --git a/sample_archive/text_content/quickfox_repeated.compressed b/sample_archive/text_content/quickfox_repeated.compressed new file mode 100644 index 00000000..f9d79767 --- /dev/null +++ b/sample_archive/text_content/quickfox_repeated.compressed @@ -0,0 +1,2 @@ +["y\ZB;%UZ5 +{K< @Mme'_0{<S \ No newline at end of file From 2fba97683a928ce64f943302829ec4a59fad279b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 16 Jun 2016 00:39:51 -0400 Subject: [PATCH 22/22] CHANGES for 0.31.0 --- CHANGES.rst | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index e5cf9355..d70ca4c3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,35 @@ +pywb 0.31.0 changelist +~~~~~~~~~~~~~~~~~~~~~~ + +* HTML rewriting: + - preserve empty attrs while parsing, eg. 
```` instead of ```` + - empty ``srcset`` attribute does not cause errors + - better error checking of empty attributes for all custom parsers + +* wombat/client side improvements: + - use ``postMessage()`` for inner replay frame -> outer frame updates + - Fix ``window.open()`` rewriting even if prototype is missing + - Fix double-slash in relative url rewriting + - ``Math.random()`` overrides uses correct window + +* BufferedReader improvements: + - More lenient of partially decompressed data, return what was decompressed instead of raising exception. + - Support Brotli decompression, properly rewrite ``Content-Encoding: br`` + +* Python 2/3 Compatibility: + - Decode all cdx fields to native string in py2 + +* BlockLoader improvements: + - support custom profile urls, eg. ``profile+http://`` which allow a custom profile to be selected if a profile loader is registered via ``BlockLoader.set_profile_loader()`` + + - s3 loader: support profiles and AWS creds directly set in username/password of url + +* POST replay improvements: + - support ``multipart/form-data`` encoding same as ``x-www-form-urlencoded`` + - support ``application/x-amf`` with experimental AMF rewriter (RewriteContentAMF rewriter) + - support generic post-data matching exact base64 encoded value. + + pywb 0.30.1 changelist ~~~~~~~~~~~~~~~~~~~~~~ @@ -21,7 +53,7 @@ pywb 0.30.0 changelist - Comparison operators for cdxobject - Reading cdxline as byte buffer, individual fields as strings (python 3) -* redis: full testing of ``zlexbyrange`` with new fakeredis +* redis: full testing of ``zrangebylex`` with new fakeredis * timeutils: add ``datetime_to_iso_date``