diff --git a/config.yaml b/config.yaml index 312f05cb..534eb4ff 100644 --- a/config.yaml +++ b/config.yaml @@ -101,8 +101,5 @@ enable_cdx_api: true # Memento support, enable enable_memento: true -# Use lxml parser, if available -use_lxml_parser: false - # Replay content in an iframe framed_replay: true diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py deleted file mode 100644 index 651e9ba4..00000000 --- a/pywb/rewrite/lxml_html_rewriter.py +++ /dev/null @@ -1,111 +0,0 @@ -try: - import lxml.etree - LXML_SUPPORTED = True -except ImportError: - LXML_SUPPORTED = False - pass - -import cgi -import re - -from regex_rewriters import JSRewriter, CSSRewriter -from url_rewriter import UrlRewriter -from html_rewriter import HTMLRewriterMixin - - -#================================================================= -class LXMLHTMLRewriter(HTMLRewriterMixin): - END_HTML = re.compile(r'', re.IGNORECASE) - - def __init__(self, *args, **kwargs): - super(LXMLHTMLRewriter, self).__init__(*args, **kwargs) - - self.target = RewriterTarget(self) - self.parser = lxml.etree.HTMLParser(remove_pis=False, - remove_blank_text=False, - remove_comments=False, - strip_cdata=False, - compact=True, - target=self.target, - recover=True, - ) - - self.started = False - - def feed(self, string): - self.started = True - string = self.END_HTML.sub(b'', string) - #string = string.replace(b'', b'') - self.parser.feed(string) - - def parse(self, stream): - self.out = self.AccumBuff() - - lxml.etree.parse(stream, self.parser) - - result = self.out.getvalue() - - # Clear buffer to create new one for next rewrite() - self.out = None - - return result - - def _internal_close(self): - if self.started: - self.parser.close() - - -#================================================================= -class RewriterTarget(object): - def __init__(self, rewriter): - self.rewriter = rewriter - - def start(self, tag, attrs): - attrs = attrs.items() - - if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): - self.rewriter.out.write(b'<' + tag) - - for name, value in attrs: - self.rewriter._write_attr(name, value, escape=True) - else: - if tag == b'head': - if (self.rewriter._rewrite_head(False)): - return - - self.rewriter.out.write(b'>') - - def end(self, tag): - if (tag == self.rewriter._wb_parse_context): - self.rewriter._wb_parse_context = None - - self.rewriter.out.write(b'') - - def data(self, data): - if not self.rewriter._wb_parse_context: - data = cgi.escape(data, quote=True) - if isinstance(data, unicode): - data = data.replace(u'\xa0', ' ') - data = data.encode('utf-8') - self.rewriter.parse_data(data) - - def comment(self, data): - self.rewriter.out.write(b'') - - def doctype(self, root_tag, public_id, system_id): - self.rewriter.out.write(b'') - - def pi(self, target, data): - self.rewriter.out.write(b'') - - def close(self): - return '' diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 6b6fb20f..01de8d2d 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from io import BytesIO from header_rewriter import RewrittenStatusAndHeaders -from rewriterules import RewriteRules, is_lxml +from rewriterules import RewriteRules from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders @@ -21,7 +21,6 @@ class RewriteContent: default_rule_config={}, ds_rules_file=ds_rules_file) self.defmod = defmod - self.decode_stream = False def sanitize_content(self, status_headers, stream): # remove transfer encoding chunked and wrap in a dechunking stream @@ -97,17 +96,15 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) - if self.decode_stream: - if rewritten_headers.charset: - encoding = rewritten_headers.charset - elif is_lxml() and text_type == 'html': - stream_raw = True - else: - (encoding, first_buff) = self._detect_charset(stream) + #if self.decode_stream: + # if rewritten_headers.charset: + # encoding = rewritten_headers.charset + # else: + # (encoding, first_buff) = self._detect_charset(stream) # if encoding not set or chardet thinks its ascii, use utf-8 - if not encoding or encoding == 'ascii': - encoding = 'utf-8' + # if not encoding or encoding == 'ascii': + # encoding = 'utf-8' rule = self.ruleset.get_first_match(urlkey) @@ -132,37 +129,23 @@ class RewriteContent: rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw, + gen = self._rewriting_stream_gen(rewriter, encoding, stream, first_buff) return (status_headers, gen, True) - def _parse_full_gen(self, rewriter, encoding, stream): - buff = rewriter.parse(stream) - buff = buff.encode(encoding) - yield buff - # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, + def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff=None): - if stream_raw: - return self._parse_full_gen(rewriter, encoding, stream) - - def do_enc_rewrite(buff): - buff = self._decode_buff(buff, stream, encoding) - buff = rewriter.rewrite(buff) - buff = buff.encode(encoding) - return buff - def do_rewrite(buff): + if encoding: + buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) - return buff + if encoding: + buff = buff.encode(encoding) - if encoding: - rewrite_func = do_enc_rewrite - else: - rewrite_func = do_rewrite + return buff def do_finish(): result = rewriter.close() @@ -172,12 +155,12 @@ class RewriteContent: return result return self.stream_to_gen(stream, - rewrite_func=rewrite_func, + rewrite_func=do_rewrite, final_read_func=do_finish, first_buff=first_buff) @staticmethod - def _decode_buff(buff, stream, encoding): + def _decode_buff(buff, stream, encoding): # pragma: no coverage try: buff = buff.decode(encoding) except UnicodeDecodeError, e: @@ -194,7 +177,7 @@ class RewriteContent: return buff - def _detect_charset(self, stream): + def _detect_charset(self, stream): # pragma: no coverage full_buff = stream.read(8192) io_buff = BytesIO(full_buff) @@ -224,17 +207,17 @@ class RewriteContent: buff = first_buff else: buff = stream.read() - # if buff and (not hasattr(stream, 'closed') or - # not stream.closed): - # buff += stream.readline() + if buff and (not hasattr(stream, 'closed') or + not stream.closed): + buff += stream.readline() while buff: if rewrite_func: buff = rewrite_func(buff) yield buff buff = stream.read() - # if buff: - # buff += stream.readline() + if buff: + buff += stream.readline() # For adding a tail/handling final buffer if final_read_func: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index f9eae0b9..d70d2d08 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -9,30 +9,6 @@ from html_rewriter import HTMLRewriter import itertools HTML = HTMLRewriter -_is_lxml = False - - -#================================================================= -def use_lxml_parser(): - import logging - from lxml_html_rewriter import LXMLHTMLRewriter, LXML_SUPPORTED - - if LXML_SUPPORTED: - global HTML - global _is_lxml - HTML = LXMLHTMLRewriter - logging.debug('Using LXML Parser') - _is_lxml = True - else: # pragma: no cover - logging.debug('LXML Parser not available') - _is_lxml = False - - return _is_lxml - - -#================================================================= -def is_lxml(): - return _is_lxml #================================================================= diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py deleted file mode 100644 index cd84d4a0..00000000 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -ur""" ->>> parse('Text') -Text - ->>> parse('
') -

- ->>> parse('
') -

- -# malformed html -- "selected" attrib dropped ->>> parse('') - - -# Base Tests ->>> parse('') - - ->>> parse('') - - -# Don't rewrite anchors ->>> parse('Text') -Text - -# Ensure attr values are not unescaped ->>> parse('

data

') -

data

- -# text moved out of input ->>> parse('data') -data - ->>> parse('') - - -# Unicode -#>>> parse('испытание') -испытание - -# Meta tag ->>> parse('') - - ->>> parse('') - - ->>> parse('') - - -# Custom -data attribs ->>> parse('
') -
- -# Script tag ->>> parse('') - - -# Script tag + crossorigin ->>> parse('') - - -# Unterminated script tag, will auto-terminate ->>> parse(' - ->>> parse('') - - ->>> parse('
') -
- ->>> parse('') - - -# Unterminated style tag, handle but don't auto-terminate ->>> parse(' - -# Head Insertion ->>> parse('Test', head_insert = '') -Test - ->>> parse('Test', head_insert = '') -Test - ->>> parse('
SomeTest
', head_insert = '/* Insert */') -/* Insert */
SomeTest
- ->>> parse('
SomeTest
', head_insert = '') -
SomeTest
- - -# content after ->>> parse('abc') -abc - -# no attr value ->>> parse(' - -# doctype ->>> parse('
abcdef
') -
abcdef
- ->>> parse('
abcdef
') -
abcdef
- ->>> parse('
abcdef
') -
abcdef
- -# uncommon markup ->>> parse('') - - -# no special cdata treatment, preserved in ') - - ->>> parse('') - - -# Test blank ->>> parse('') - - -# Test no parsing at all ->>> p = LXMLHTMLRewriter(urlrewriter) ->>> p.close() -'' - -# test   ->>> parse(' ') -

 

- -# test multiple rewrites:   extra >, split comment ->>> p = LXMLHTMLRewriter(urlrewriter) ->>> p.rewrite('
    >
') + p.close() -'
    >
' -""" - -from pywb.rewrite.url_rewriter import UrlRewriter - -from pywb.rewrite.lxml_html_rewriter import LXMLHTMLRewriter, LXML_SUPPORTED - -urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') - -def parse(data, head_insert=None): - parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) - data = data.decode('utf-8') - print parser.rewrite(data) + parser.close() - -if __name__ == "__main__": - if LXML_SUPPORTED: - import doctest - doctest.testmod() -else: - # skip if not supported and lxml not available - if not LXML_SUPPORTED: - import pytest - lxml = pytest.importorskip('lxml.etree') - - diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index fcae5fa4..a69cf8e9 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -1,7 +1,6 @@ from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route -from pywb.rewrite.rewriterules import use_lxml_parser from handlers import StaticHandler @@ -23,5 +22,4 @@ def create_live_rewriter_app(): Route('static/default', StaticHandler('pywb/static/')) ] -# use_lxml_parser() return ArchivalRouter(routes, hostpaths=['http://localhost:8080']) diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 445014ed..3ec39dfc 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -9,8 +9,6 @@ from pywb.framework.basehandlers import BaseHandler from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader -from pywb.rewrite.rewriterules import use_lxml_parser - from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView @@ -45,9 +43,7 @@ DEFAULTS = { 'domain_specific_rules': DEFAULT_RULES_FILE, - 'enable_memento': True, - - 'use_lxml_parser': True, + 'enable_memento': True } @@ -177,8 +173,8 @@ def create_wb_router(passed_config={}): else: request_class = WbRequest - if config.get('use_lxml_parser', False): - use_lxml_parser() + #if config.get('use_lxml_parser', False): + # use_lxml_parser() for name, value in collections.iteritems(): diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 3692d68c..8cc14b7d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -66,17 +66,19 @@ class BaseContentView(object): # render top level frame if in frame mode # (not supported in proxy mode) if (self.is_frame_mode and - not wbrequest.is_proxy and - not wbrequest.wb_url.mod): + not wbrequest.is_proxy and + not wbrequest.wb_url.mod): embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod) timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) url = wbrequest.wb_url.url + ctype='text/html' return self.frame_insert_view.render_response(embed_url=embed_url, wbrequest=wbrequest, timestamp=timestamp, - url=url) + url=url, + content_type=ctype) return self.render_content(wbrequest, *args) diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 639bcc52..903cc818 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -98,7 +98,7 @@ class J2TemplateView(object): def render_response(self, **kwargs): template_result = self.render_to_string(**kwargs) status = kwargs.get('status', '200 OK') - content_type = 'text/html; charset=utf-8' + content_type = kwargs.get('content_type', 'text/html; charset=utf-8') return WbResponse.text_response(template_result.encode('utf-8'), status=status, content_type=content_type) diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 77d06a4a..bace37eb 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -110,7 +110,7 @@ reporter: !!python/object/new:tests.fixture.PrintReporter [] #domain_specific_rules: rules.yaml # Use lxml parser, if available -use_lxml_parser: true +# use_lxml_parser: true # Replay content in an iframe framed_replay: true