From d6006acdc3dfab128ff9798a00c1348ab7e883c1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 6 Apr 2014 09:47:34 -0700 Subject: [PATCH 01/55] rewrite: when using lxml parser, just pass raw stream to lxml without decoding. lxml parser expects to have raw bytes and will determine encoding on its own. then serve back as utf-8 if no encoding specified. should address #36 --- pywb/rewrite/lxml_html_rewriter.py | 3 +- pywb/rewrite/rewrite_content.py | 39 ++++++++++++-------- pywb/rewrite/rewriterules.py | 11 +++++- pywb/rewrite/test/test_lxml_html_rewriter.py | 9 +++++ 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 2c8a8b8a..8aac2f54 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -79,7 +79,8 @@ class RewriterTarget(object): def data(self, data): if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) - + if isinstance(data, unicode): + data = data.replace(u'\xa0', ' ') self.rewriter.parse_data(data) def comment(self, data): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 0acdf5a5..720bf9f1 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from io import BytesIO from header_rewriter import RewrittenStatusAndHeaders -from rewriterules import RewriteRules +from rewriterules import RewriteRules, is_lxml from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders @@ -73,21 +73,25 @@ class RewriteContent: # ==================================================================== # special case -- need to ungzip the body + text_type = rewritten_headers.text_type + stream_raw = False + encoding = None + first_buff = None + if (rewritten_headers. contains_removed_header('content-encoding', 'gzip')): stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset - first_buff = None + elif is_lxml() and text_type == 'html': + stream_raw = True else: (encoding, first_buff) = self._detect_charset(stream) - # if chardet thinks its ascii, use utf-8 - if encoding == 'ascii': - encoding = 'utf-8' - - text_type = rewritten_headers.text_type + # if encoding not set or chardet thinks its ascii, use utf-8 + if not encoding or encoding == 'ascii': + encoding = 'utf-8' rule = self.ruleset.get_first_match(urlkey) @@ -108,34 +112,33 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str) + else: # apply one of (js, css, xml) rewriters rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, + gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw, stream, first_buff) return (status_headers, gen, True) # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, + def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, stream, first_buff=None): def do_rewrite(buff): - if encoding: + if not stream_raw: buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) - if encoding: - buff = buff.encode(encoding) + buff = buff.encode(encoding) return buff def do_finish(): result = rewriter.close() - if encoding: - result = result.encode(encoding) + result = result.encode(encoding) return result @@ -188,12 +191,16 @@ class RewriteContent: def stream_to_gen(stream, rewrite_func=None, final_read_func=None, first_buff=None): try: - buff = first_buff if first_buff else stream.read() + if first_buff: + buff = first_buff + else: + buff = stream.read() + stream.readline() + while buff: if rewrite_func: buff = rewrite_func(buff) yield buff - buff = stream.read() + buff = stream.read() + stream.readline() # For adding a tail/handling final buffer if final_read_func: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 03a23653..a7737248 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter import itertools HTML = HTMLRewriter +_is_lxml = False #================================================================= @@ -18,13 +19,19 @@ def use_lxml_parser(): if LXML_SUPPORTED: global HTML + global _is_lxml HTML = LXMLHTMLRewriter logging.debug('Using LXML Parser') - return True + _is_lxml = True else: # pragma: no cover logging.debug('LXML Parser not available') - return False + _is_lxml = False + return _is_lxml + + +def is_lxml(): + return _is_lxml #================================================================= class RewriteRules(BaseRule): diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py index 125977e7..038de4a8 100644 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ b/pywb/rewrite/test/test_lxml_html_rewriter.py @@ -119,6 +119,15 @@ ur""" >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.close() '' + +# test   +>>> parse(' ') +

 

+ +# test multiple rewrites:   extra >, split comment +>>> p = LXMLHTMLRewriter(urlrewriter) +>>> p.rewrite('
    >
') + p.close() +u'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter From d8c20a59cf2a2381315f1f26da62b7258d9e5060 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 6 Apr 2014 11:46:43 -0700 Subject: [PATCH 02/55] update to version 0.3.1 --- CHANGES.rst | 8 +++++++- README.rst | 2 +- setup.py | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 547a8f52..129f2307 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,4 +1,10 @@ -pywb 0.2.2 changelist +pywb 0.3.1 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Improve lxml rewriting, letting lxml handle decoding from bytestream (to address #36) + + +pywb 0.3.0 changelist ~~~~~~~~~~~~~~~~~~~~~ * Generate cdx indexs via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory. diff --git a/README.rst b/README.rst index 86c18e06..f9fc0fc2 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.2.2 +PyWb 0.3.1 ============= .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/setup.py b/setup.py index 0a578f54..cb5717f1 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.2.2', + version='0.3.1', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', @@ -84,6 +84,7 @@ setup( [console_scripts] wayback = pywb.apps.wayback:main cdx-server = pywb.apps.cdx_server:main + rewrite-live = pywb.apps.rewrite_live:main cdx-indexer = pywb.warc.archiveindexer:main """, zip_safe=False, From 64eef7063dfa904f4fee090e65307e3f1ff53a29 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:08:39 -0700 Subject: [PATCH 03/55] record reading: better handling of empty arc (or warc) records for indexing, index empty/invalid length as '-' status code for reading, serve as 204 no content. ensure that StatusAndHeaders has a valid statusline when serving if http content-length is valid,, limit stream to that content-length as well as record content-length (whichever is smaller) replace content-length when buffering --- pywb/utils/loaders.py | 12 +++++- pywb/utils/statusandheaders.py | 54 +++++++++++++++++++++--- pywb/utils/test/test_statusandheaders.py | 17 ++++++++ pywb/warc/archiveindexer.py | 12 +++++- pywb/warc/recordloader.py | 12 ++++-- pywb/warc/resolvingloader.py | 3 ++ pywb/warc/test/test_indexing.py | 7 +-- pywb/webapp/replay_views.py | 14 +++--- 8 files changed, 108 insertions(+), 23 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index ccbe960e..f86e4072 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -93,7 +93,10 @@ class BlockLoader(object): headers['Range'] = range_header if self.cookie_maker: - headers['Cookie'] = self.cookie_maker.make() + if isinstance(self.cookie_maker, basestring): + headers['Cookie'] = self.cookie_maker + else: + headers['Cookie'] = self.cookie_maker.make() request = urllib2.Request(url, headers=headers) return urllib2.urlopen(request) @@ -184,7 +187,12 @@ class LimitReader(object): try: content_length = int(content_length) if content_length >= 0: - stream = LimitReader(stream, content_length) + # optimize: if already a LimitStream, set limit to + # the smaller of the two limits + if isinstance(stream, LimitReader): + stream.limit = min(stream.limit, content_length) + else: + stream = LimitReader(stream, content_length) except (ValueError, TypeError): pass diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 24dcf784..85805cb2 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -29,6 +29,21 @@ class StatusAndHeaders(object): if value[0].lower() == name_lower: return value[1] + def replace_header(self, name, value): + """ + replace header with new value or add new header + return old header value, if any + """ + name_lower = name.lower() + for index in xrange(len(self.headers) - 1, -1, -1): + curr_name, curr_value = self.headers[index] + if curr_name.lower() == name_lower: + self.headers[index] = (curr_name, value) + return curr_value + + self.headers.append((name, value)) + return None + def remove_header(self, name): """ remove header (case-insensitive) @@ -42,6 +57,20 @@ class StatusAndHeaders(object): return False + def validate_statusline(self, valid_statusline): + """ + Check that the statusline is valid, eg. starts with a numeric + code. If not, replace with passed in valid_statusline + """ + code = self.statusline.split(' ', 1)[0] + try: + code = int(code) + assert(code > 0) + return True + except ValueError, AssertionError: + self.statusline = valid_statusline + return False + def __repr__(self): headers_str = pprint.pformat(self.headers, indent=2) return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ @@ -81,9 +110,16 @@ class StatusAndHeadersParser(object): statusline, total_read = _strip_count(full_statusline, 0) + headers = [] + # at end of stream if total_read == 0: raise EOFError() + elif not statusline: + return StatusAndHeaders(statusline=statusline, + headers=headers, + protocol='', + total_len=total_read) protocol_status = self.split_prefix(statusline, self.statuslist) @@ -92,13 +128,15 @@ class StatusAndHeadersParser(object): msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, full_statusline) - headers = [] - line, total_read = _strip_count(stream.readline(), total_read) while line: - name, value = line.split(':', 1) - name = name.rstrip(' \t') - value = value.lstrip() + result = line.split(':', 1) + if len(result) == 2: + name = result[0].rstrip(' \t') + value = result[1].lstrip() + else: + name = result[0] + value = None next_line, total_read = _strip_count(stream.readline(), total_read) @@ -109,8 +147,10 @@ class StatusAndHeadersParser(object): next_line, total_read = _strip_count(stream.readline(), total_read) - header = (name, value) - headers.append(header) + if value is not None: + header = (name, value) + headers.append(header) + line = next_line return StatusAndHeaders(statusline=protocol_status[1].strip(), diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py index ea835e32..061532a3 100644 --- a/pywb/utils/test/test_statusandheaders.py +++ b/pywb/utils/test/test_statusandheaders.py @@ -13,6 +13,14 @@ StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - >>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) True +# replace header, print new headers +>>> st1.replace_header('some', 'Another-Value'); st1 +'Value' +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), + ('Some', 'Another-Value'), + ('Multi-Line', 'Value1 Also This')]) + + # remove header >>> st1.remove_header('some') True @@ -20,6 +28,10 @@ True # already removed >>> st1.remove_header('Some') False + +# empty +>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2 +StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []) """ @@ -37,6 +49,11 @@ Multi-Line: Value1\r\n\ Body" +status_headers_2 = """ + +""" + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index 6ee3a10c..2247ced4 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -164,7 +164,7 @@ class ArchiveIndexer(object): digest = record.rec_headers.get_header('WARC-Payload-Digest') - status = record.status_headers.statusline.split(' ')[0] + status = self._extract_status(record.status_headers) if record.rec_type == 'revisit': mime = 'warc/revisit' @@ -205,7 +205,9 @@ class ArchiveIndexer(object): timestamp = record.rec_headers.get_header('archive-date') if len(timestamp) > 14: timestamp = timestamp[:14] - status = record.status_headers.statusline.split(' ')[0] + + status = self._extract_status(record.status_headers) + mime = record.rec_headers.get_header('content-type') mime = self._extract_mime(mime) @@ -228,6 +230,12 @@ class ArchiveIndexer(object): mime = 'unk' return mime + def _extract_status(self, status_headers): + status = status_headers.statusline.split(' ')[0] + if not status: + status = '-' + return status + def read_rest(self, reader, digester=None): """ Read remainder of the stream If a digester is included, update it diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 96e149e3..4c71dee3 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -97,18 +97,24 @@ class ArcWarcRecordLoader: rec_type = rec_headers.get_header('WARC-Type') length = rec_headers.get_header('Content-Length') + is_err = False + try: length = int(length) if length < 0: - length = 0 + is_err = True except ValueError: - length = 0 + is_err = True # ================================================================ # handle different types of records + # err condition + if is_err: + status_headers = StatusAndHeaders('-', []) + length = 0 # special case: empty w/arc record (hopefully a revisit) - if length == 0: + elif length == 0: status_headers = StatusAndHeaders('204 No Content', []) # special case: warc records that are not expected to have http headers diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 10c7caa0..393efc3e 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -63,6 +63,9 @@ class ResolvingLoader: if not headers_record or not payload_record: raise ArchiveLoadFailed('Could not load ' + str(cdx)) + # ensure status line is valid from here + headers_record.status_headers.validate_statusline('204 No Content') + return (headers_record.status_headers, payload_record.stream) def _resolve_path_load(self, cdx, is_original, failed_files): diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 0e470424..0a3d6038 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -36,8 +36,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/ # bad arcs -- test error edge cases >>> print_cdx_index('bad.arc') CDX N b a m s k r M S V g -com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc -com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc +com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc +com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc +com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc # Test CLI interface -- (check for num lines) #================================================================= @@ -46,7 +47,7 @@ com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -200 +201 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 2ab17225..31fe4b57 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -1,9 +1,9 @@ import re from io import BytesIO -from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException +from pywb.utils.loaders import LimitReader from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse @@ -105,6 +105,9 @@ class ReplayView(object): if redir_response: return redir_response + length = status_headers.get_header('content-length') + stream = LimitReader.wrap_stream(stream, length) + # one more check for referrer-based self-redirect self._reject_referrer_self_redirect(wbrequest) @@ -124,9 +127,6 @@ class ReplayView(object): # buffer response if buffering enabled if self.buffer_response: - if wbrequest.is_identity: - status_headers.remove_header('content-length') - response_iter = self.buffered_response(status_headers, response_iter) @@ -165,8 +165,10 @@ class ReplayView(object): content = out.getvalue() content_length_str = str(len(content)) - status_headers.headers.append(('Content-Length', - content_length_str)) + + # remove existing content length + status_headers.replace_header('Content-Length', + content_length_str) out.close() return content From 890c323617436565660c647a8a5bf670d2ec9c2c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:12:33 -0700 Subject: [PATCH 04/55] update bad.arc with empty record example --- sample_archive/warcs/bad.arc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sample_archive/warcs/bad.arc b/sample_archive/warcs/bad.arc index 0d812251..9de41600 100644 --- a/sample_archive/warcs/bad.arc +++ b/sample_archive/warcs/bad.arc @@ -4,4 +4,8 @@ URL IP-address Archive-date Content-type Archive-length http://example.com/ 93.184.216.119 201404010000000000 text/html -1 +http://example.com/ 127.0.0.1 20140102000000 text/plain 1 + + http://example.com/ 93.184.216.119 201404010000000000 text/html abc + From 2a318527dfb2c5721c2ab4dfe531c51bcff8f568 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:13:43 -0700 Subject: [PATCH 05/55] lxml: use lxml's parse interface instead of feed interface to allow xml to handle decoding unicode data, better address #36 --- pywb/rewrite/lxml_html_rewriter.py | 12 ++++++++++++ pywb/rewrite/rewrite_content.py | 12 ++++++++++-- pywb/rewrite/test/test_regex_rewriters.py | 2 +- pywb/rewrite/url_rewriter.py | 6 +++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 8aac2f54..abf28fc4 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -45,6 +45,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): #string = string.replace(u'', u'') self.parser.feed(string) + def parse(self, stream): + self.out = self.AccumBuff() + + lxml.etree.parse(stream, self.parser) + + result = self.out.getvalue() + + # Clear buffer to create new one for next rewrite() + self.out = None + + return result + def _internal_close(self): if self.started: self.parser.close() diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 720bf9f1..c2d17047 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -123,12 +123,20 @@ class RewriteContent: return (status_headers, gen, True) + def _parse_full_gen(self, rewriter, encoding, stream): + buff = rewriter.parse(stream) + buff = buff.encode(encoding) + yield buff + # Create rewrite stream, may even be chunked by front-end def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, stream, first_buff=None): + + if stream_raw: + return self._parse_full_gen(rewriter, encoding, stream) + def do_rewrite(buff): - if not stream_raw: - buff = self._decode_buff(buff, stream, encoding) + buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index fac38789..17bf0a75 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -51,7 +51,7 @@ r""" # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_///example.com/abc.html" //comment' +'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' #================================================================= diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 9545a040..cb35607f 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -14,7 +14,7 @@ class UrlRewriter(object): NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:'] + PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] def __init__(self, wburl, prefix): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) @@ -32,6 +32,10 @@ class UrlRewriter(object): isAbs = any(url.startswith(x) for x in self.PROTOCOLS) + if url.startswith('//'): + isAbs = True + url = 'http:' + url + # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod From c23dd7bda49519d38272b16591183da19ff2fafe Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:17:08 -0700 Subject: [PATCH 06/55] wombat update: - support scheme-relative (//) urls - override dom manipulation (appendChild, insertBefore, replaceChild) - disable Worker() interface for now --- pywb/static/wombat.js | 105 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 112d6d37..47d5042b 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -52,25 +52,49 @@ WB_wombat_init = (function() { return false; } + function starts_with(string, prefix) { + if (string.indexOf(prefix) == 0) { + return prefix; + } else { + return undefined; + } + } + //============================================ +/* function rewrite_url_debug(url) { + rewritten = rewrite_url_(url); + if (url != rewritten) { + console.log('REWRITE: ' + url + ' -> ' + rewritten); + } else { + console.log('NOT REWRITTEN ' + url); + } + return rewritten; + } +*/ function rewrite_url(url) { var http_prefix = "http://"; var https_prefix = "https://"; + var rel_prefix = "//"; // If not dealing with a string, just return it if (!url || (typeof url) != "string") { return url; } + // ignore anchors + if (starts_with(url, "#")) { + return url; + } + // If starts with prefix, no rewriting needed // Only check replay prefix (no date) as date may be different for each // capture - if (url.indexOf(wb_replay_prefix) == 0) { + if (starts_with(url, wb_replay_prefix)) { return url; } // If server relative url, add prefix and original host - if (url.charAt(0) == "/") { + if (url.charAt(0) == "/" && !starts_with(url, rel_prefix)) { // Already a relative url, don't make any changes! if (url.indexOf(wb_capture_date_part) >= 0) { @@ -81,13 +105,21 @@ WB_wombat_init = (function() { } // If full url starting with http://, add prefix - if (url.indexOf(http_prefix) == 0 || url.indexOf(https_prefix) == 0) { + + var prefix = starts_with(url, http_prefix) || + starts_with(url, https_prefix) || + starts_with(url, rel_prefix); + + if (prefix) { + if (starts_with(url, prefix + window.location.host + '/')) { + return url; + } return wb_replay_date_prefix + url; } // May or may not be a hostname, call function to determine // If it is, add the prefix and make sure port is removed - if (is_host_url(url)) { + if (is_host_url(url) && !starts_with(url, window.location.host + '/')) { return wb_replay_date_prefix + http_prefix + url; } @@ -252,12 +284,73 @@ WB_wombat_init = (function() { function open_rewritten(method, url, async, user, password) { url = rewrite_url(url); + + // defaults to true + if (async != false) { + async = true; + } + return orig.call(this, method, url, async, user, password); } window.XMLHttpRequest.prototype.open = open_rewritten; } + function init_worker_override() { + if (!window.Worker) { + return; + } + + // for now, disabling workers until override of worker content can be supported + // hopefully, pages depending on workers will have a fallback + window.Worker = undefined; + } + + + function rewrite_attr(elem, name) { + if (!elem || !elem.getAttribute) { + return; + } + + value = elem.getAttribute(name); + + if (!value) { + return; + } + + if (starts_with(value, "javascript:")) { + return; + } + + orig_value = value; + value = rewrite_url(value); + + elem.setAttribute(name, value); + } + + function init_dom_override() { + if (!Element || + !Element.prototype) { + return; + } + + function replace_dom_func(funcname) { + + var orig = Element.prototype[funcname]; + + Element.prototype[funcname] = function() { + rewrite_attr(arguments[0], "src"); + rewrite_attr(arguments[0], "href"); + + return orig.apply(this, arguments); + } + } + + replace_dom_func("appendChild"); + replace_dom_func("insertBefore"); + replace_dom_func("replaceChild"); + } + //============================================ function wombat_init(replay_prefix, capture_date, orig_host, timestamp) { wb_replay_prefix = replay_prefix; @@ -287,6 +380,10 @@ WB_wombat_init = (function() { // Ajax init_ajax_rewrite(); + init_worker_override(); + + // DOM + init_dom_override(); // Random init_seeded_random(timestamp); From a3310616918c8283939535e737cac60eaed05487 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:19:07 -0700 Subject: [PATCH 07/55] minor tweaks: add default static_path for jinja, remove unused import --- pywb/webapp/cdx_api_handler.py | 1 - pywb/webapp/views.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py index e54de959..e3e16a72 100644 --- a/pywb/webapp/cdx_api_handler.py +++ b/pywb/webapp/cdx_api_handler.py @@ -1,6 +1,5 @@ from pywb.cdx.cdxserver import create_cdx_server -from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.basehandlers import BaseHandler from pywb.framework.wbrequestresponse import WbResponse diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index cc1ea7be..c452d0e0 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -66,7 +66,7 @@ def is_wb_handler(obj): #================================================================= class J2TemplateView: - env_globals = {} + env_globals = {'static_path': 'static/default'} def __init__(self, filename): template_dir, template_file = path.split(filename) From 02fe78cb0bc27534f4bafd0feca971b47bd4c0e7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Apr 2014 17:41:14 -0700 Subject: [PATCH 08/55] update changes, add more tests --- CHANGES.rst | 6 +++++- pywb/utils/test/test_loaders.py | 4 ++++ pywb/utils/test/test_statusandheaders.py | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 129f2307..2a05be24 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,11 @@ pywb 0.3.1 changelist ~~~~~~~~~~~~~~~~~~~~~ -* Improve lxml rewriting, letting lxml handle decoding from bytestream (to address #36) +* Update wombat.js to support: scheme-relative urls rewriting, dom manipulation rewriting, disable web Worker api which could leak to live requests + +* Fixed support for empty arc/warc records. Indexed with '-', replay with '204 No Content' + +* Improve lxml rewriting, letting lxml handle parsing and decoding from bytestream directly (to address #36) pywb 0.3.0 changelist diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index c88805b5..88368146 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -32,6 +32,10 @@ True >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() 'Example Domain' +# fixed cookie +>>> BlockLoader('some=value').load('http://example.com', 41, 14).read() +'Example Domain' + # test with extra id, ensure 4 parts of the A-B=C-D form are present >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra'))) 4 diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py index 061532a3..2ee894b9 100644 --- a/pywb/utils/test/test_statusandheaders.py +++ b/pywb/utils/test/test_statusandheaders.py @@ -42,6 +42,7 @@ from io import BytesIO status_headers_1 = "\ HTTP/1.0 200 OK\r\n\ Content-Type: ABC\r\n\ +HTTP/1.0 200 OK\r\n\ Some: Value\r\n\ Multi-Line: Value1\r\n\ Also This\r\n\ From 8897a0a7c96017f359c8a28b78f584ea6cceef84 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 8 Apr 2014 21:49:04 -0700 Subject: [PATCH 09/55] decompressingbufferedreader: default to 'gzip' decompression instead of none. ChunkedDataReader also automatically attempts decompression, by default Add tests to verify --- pywb/utils/bufferedreaders.py | 2 +- pywb/utils/test/test_bufferedreaders.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index f434e492..aece175f 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -30,7 +30,7 @@ class DecompressingBufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor} def __init__(self, stream, block_size=1024, - decomp_type=None, + decomp_type='gzip', starting_data=None): self.stream = stream self.block_size = block_size diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 558f8782..d061218c 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -10,8 +10,8 @@ r""" >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\n' -# decompress with on the fly compression ->>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n')), decomp_type = 'gzip').read() +# decompress with on the fly compression, default gzip compression +>>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read() 'ABC\n1234\n' # error: invalid compress type @@ -27,6 +27,11 @@ Exception: Decompression type not supported: bzip2 Traceback (most recent call last): error: Error -3 while decompressing: incorrect header check +# invalid output when reading compressed data as not compressed +>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != 'ABC' +True + + # DecompressingBufferedReader readline() with decompression (zipnum file, no header) >>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline() 'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n' @@ -60,6 +65,14 @@ Non-chunked data: >>> ChunkedDataReader(BytesIO("xyz123!@#")).read() 'xyz123!@#' +Non-chunked, compressed data +>>> ChunkedDataReader(BytesIO(compress('ABCDEF'))).read() +'ABCDEF' + +Non-chunked, compressed data +>>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read() +'\nABCDEF\nGHIJ' + Starts like chunked data, but isn't: >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#")); >>> c.read() + c.read() From 1fb6f5eff76ea8c4eaf664dd3ad02483707e1b00 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 8 Apr 2014 22:43:32 -0700 Subject: [PATCH 10/55] add rewriter_handler, frame wrapper support! --- pywb/apps/rewrite_live.py | 16 ++++ pywb/static/wb.css | 39 +++++++- pywb/static/wb.js | 54 ++++++++++- pywb/static/wombat.js | 106 +++++++++++++++------ pywb/ui/frame_insert.html | 54 +++++++++++ pywb/webapp/rewrite_handler.py | 165 +++++++++++++++++++++++++++++++++ 6 files changed, 398 insertions(+), 36 deletions(-) create mode 100644 pywb/apps/rewrite_live.py create mode 100644 pywb/ui/frame_insert.html create mode 100644 pywb/webapp/rewrite_handler.py diff --git a/pywb/apps/rewrite_live.py b/pywb/apps/rewrite_live.py new file mode 100644 index 00000000..e3b8f45b --- /dev/null +++ b/pywb/apps/rewrite_live.py @@ -0,0 +1,16 @@ +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server + +from pywb.webapp.rewrite_handler import create_rewrite_app + +#================================================================= +# init cdx server app +#================================================================= + +application = init_app(create_rewrite_app, load_yaml=False) + + +def main(): # pragma: no cover + start_wsgi_server(application, 'Rewrite App', default_port=8090) + +if __name__ == "__main__": + main() diff --git a/pywb/static/wb.css b/pywb/static/wb.css index 1367a2fe..3a36f54a 100644 --- a/pywb/static/wb.css +++ b/pywb/static/wb.css @@ -5,11 +5,12 @@ top: 0px !important; left: 0px !important; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important; - position: absolute !important; - padding: 4px !important; + position: fixed !important; + /* padding: 4px !important; */ + height: 40px !important; width: 100% !important; font-size: 24px !important; - border: 1px solid !important; +/* border: 1px solid !important; */ background-color: lightYellow !important; color: black !important; text-align: center !important; @@ -17,3 +18,35 @@ line-height: normal !important; } +.wb_iframe_div +{ + width: 100%; + height: 100%; + padding: 40px 8px 8px 0px; + border: none; + box-sizing: border-box; + -moz-box-sizing: border-box; + -webkit-box-sizing: border-box; +} + +.wb_iframe +{ + width: 100%; + height: 100%; + border: 4px solid firebrick; +} + +.wb_iframe_all +{ + width: 100%; + height: 100%; + border: none; + background-color: firebrick; + padding: 44px 4px 4px 4px; + box-sizing: border-box; + -moz-box-sizing: border-box; + -webkit-box-sizing: border-box; +} + + + diff --git a/pywb/static/wb.js b/pywb/static/wb.js index ae5b586c..41343f81 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -26,6 +26,10 @@ function init_banner() { return; } + if (window.top != window.self) { + return; + } + if (!banner) { banner = document.createElement("wb_div"); banner.setAttribute("id", BANNER_ID); @@ -41,12 +45,54 @@ function init_banner() { } } -var readyStateCheckInterval = setInterval(function() { +function add_event(name, func, object) { + if (object.addEventListener) { + object.addEventListener(name, func); + return true; + } else if (object.attachEvent) { + object.attachEvent("on" + name, func); + return true; + } else { + return false; + } +} + +function remove_event(name, func, object) { + if (object.removeEventListener) { + object.removeEventListener(name, func); + return true; + } else if (object.detachEvent) { + object.detachEvent("on" + name, func); + return true; + } else { + return false; + } +} + +var notified_top = false; + +var detect_on_init = function() { + if (!notified_top && window && window.top && (window.self != window.top) && window.WB_wombat_location) { + if (!wbinfo.is_embed) { + window.top.postMessage(window.WB_wombat_location.href, "*"); + } + notified_top = true; + } + if (document.readyState === "interactive" || document.readyState === "complete") { init_banner(); - - clearInterval(readyStateCheckInterval); + + remove_event("readystatechange", detect_on_init, document); } -}, 10); +} + +add_event("readystatechange", detect_on_init, document); + +/* +if ((window.self == window.top) && !wbinfo.is_embed && window.location.href.indexOf("/rewrite/fr_/") == -1) { + new_loc = window.location.href.replace("/rewrite/", "/rewrite/fr_/"); + window.location.replace(new_loc); +} +*/ diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 47d5042b..457d7d5a 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -60,18 +60,28 @@ WB_wombat_init = (function() { } } + function ends_with(str, suffix) { + if (str.indexOf(suffix, str.length - suffix.length) !== -1) { + return suffix; + } else { + return undefined; + } + } + //============================================ -/* function rewrite_url_debug(url) { + var rewrite_url = rewrite_url_debug; + + function rewrite_url_debug(url) { rewritten = rewrite_url_(url); if (url != rewritten) { console.log('REWRITE: ' + url + ' -> ' + rewritten); } else { - console.log('NOT REWRITTEN ' + url); + //console.log('NOT REWRITTEN ' + url); } return rewritten; } -*/ - function rewrite_url(url) { + + function rewrite_url_(url) { var http_prefix = "http://"; var https_prefix = "https://"; var rel_prefix = "//"; @@ -144,13 +154,22 @@ WB_wombat_init = (function() { if (!href) { return ""; } + href = href.toString(); + var index = href.indexOf("/http", 1); + + // extract original url from wburl if (index > 0) { - return href.substr(index + 1); - } else { - return href; + href = href.substr(index + 1); } + + // remove trailing slash + if (ends_with(href, "/")) { + href = href.substring(0, href.length - 1); + } + + return href; } //============================================ @@ -196,26 +215,39 @@ WB_wombat_init = (function() { } //============================================ - function update_location(req_href, orig_href, location) { - if (req_href && (extract_orig(orig_href) != extract_orig(req_href))) { - var final_href = rewrite_url(req_href); - - location.href = final_href; + function update_location(req_href, orig_href, actual_location) { + if (!req_href || req_href == orig_href) { + return; } + + ext_orig = extract_orig(orig_href); + ext_req = extract_orig(req_href); + + if (!ext_orig || ext_orig == ext_req) { + return; + } + + var final_href = rewrite_url(req_href); + + console.log(actual_location.href + ' -> ' + final_href); + + actual_location.href = final_href; } //============================================ function check_location_change(loc, is_top) { var locType = (typeof loc); - var location = (is_top ? window.top.location : window.location); + var actual_location = (is_top ? window.top.location : window.location); + + //console.log(loc.href); // String has been assigned to location, so assign it if (locType == "string") { - update_location(loc, location.href, location) + update_location(loc, actual_location.href, actual_location) } else if (locType == "object") { - update_location(loc.href, loc._orig_href, location); + update_location(loc.href, loc._orig_href, actual_location); } } @@ -306,7 +338,6 @@ WB_wombat_init = (function() { window.Worker = undefined; } - function rewrite_attr(elem, name) { if (!elem || !elem.getAttribute) { return; @@ -324,25 +355,41 @@ WB_wombat_init = (function() { orig_value = value; value = rewrite_url(value); - + elem.setAttribute(name, value); } function init_dom_override() { - if (!Element || - !Element.prototype) { + if (!Node || !Node.prototype) { return; } function replace_dom_func(funcname) { + var orig = Node.prototype[funcname]; - var orig = Element.prototype[funcname]; - - Element.prototype[funcname] = function() { + Node.prototype[funcname] = function() { rewrite_attr(arguments[0], "src"); rewrite_attr(arguments[0], "href"); - return orig.apply(this, arguments); + child = arguments[0]; + + var desc; + + if (child instanceof DocumentFragment) { + desc = child.querySelectorAll("*[href],*[src]"); + } else if (child.getElementsByTagName) { + desc = child.getElementsByTagName("*"); + } + + if (desc) { + for (var i = 0; i < desc.length; i++) { + rewrite_attr(desc[i], "src"); + rewrite_attr(desc[i], "href"); + } + } + + result = orig.apply(this, arguments); + return result; } } @@ -363,13 +410,14 @@ WB_wombat_init = (function() { window.WB_wombat_location = copy_location_obj(window.self.location); document.WB_wombat_location = window.WB_wombat_location; - if (window.self.location != window.top.location) { - window.top.WB_wombat_location = copy_location_obj(window.top.location); - } + //if (window.self.location != window.top.location) { + // window.top.WB_wombat_location = copy_location_obj(window.top.location); + //} + window.top.WB_wombat_location = window.WB_wombat_location; - if (window.opener) { - window.opener.WB_wombat_location = copy_location_obj(window.opener.location); - } + //if (window.opener) { + // window.opener.WB_wombat_location = copy_location_obj(window.opener.location); + //} // Domain document.WB_wombat_domain = orig_host; diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html new file mode 100644 index 00000000..fd772251 --- /dev/null +++ b/pywb/ui/frame_insert.html @@ -0,0 +1,54 @@ + + + + + + + + + +
+