diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 4ec3e530..2c5a18b1 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -169,7 +169,9 @@ class HTMLRewriterMixin(object): else: # special case: base tag if (tag == 'base') and (attr_name == 'href') and attr_value: - self.url_rewriter.set_base_url(attr_value) + #self.url_rewriter.set_base_url(attr_value) + self.url_rewriter = (self.url_rewriter. + rebase_rewriter(attr_value)) rw_mod = handler.get(attr_name) if rw_mod is not None: diff --git a/pywb/rewrite/lxml_parser.py b/pywb/rewrite/lxml_parser.py index 6c9e13c9..4956c872 100644 --- a/pywb/rewrite/lxml_parser.py +++ b/pywb/rewrite/lxml_parser.py @@ -1,18 +1,18 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import lxml.html +#import lxml.html import lxml.etree import cgi +import re from regex_rewriters import JSRewriter, CSSRewriter from url_rewriter import UrlRewriter from html_rewriter import HTMLRewriterMixin -from StringIO import StringIO class LXMLHTMLRewriter(HTMLRewriterMixin): - r""" + ur""" >>> parse('Text') Text @@ -45,7 +45,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): # Unicode - #>>> parse('испытание') + >>> parse('испытание') испытание # Meta tag @@ -106,6 +106,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): """ + END_HTML = re.compile(r'', re.IGNORECASE) + def __init__(self, url_rewriter, head_insert=None, js_rewriter_class=JSRewriter, @@ -116,7 +118,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): js_rewriter_class, css_rewriter_class) - self.target = RewriterTarget(self) self.parser = lxml.etree.HTMLParser(remove_pis=False, remove_blank_text=False, @@ -127,15 +128,19 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): recover=True, ) + self.is_closing = False + def feed(self, string): - string = string.replace('', '') + string = self.END_HTML.sub(u'', string) + #string = string.replace(u'', u'') self.parser.feed(string) def close(self): if not self.out: self.out = self.AccumBuff() + self.is_closing = True self.parser.close() result = self.out.getvalue() @@ -153,23 +158,26 @@ class RewriterTarget(object): attrs = attrs.items() if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): - self.rewriter.out.write('<' + tag) + self.rewriter.out.write(u'<' + tag) for name, value in attrs: self.rewriter._write_attr(name, value, escape=True) else: - if tag == 'head': + if tag == u'head': if (self.rewriter._rewrite_head(False)): return - self.rewriter.out.write('>') + self.rewriter.out.write(u'>') def end(self, tag): + #if tag == 'html' and not self.rewriter.is_closing: + # raise lxml.etree.LxmlError('test') + if (tag == self.rewriter._wb_parse_context): self.rewriter._wb_parse_context = None - self.rewriter.out.write('') + self.rewriter.out.write(u'') def data(self, data): if not self.rewriter._wb_parse_context: @@ -178,12 +186,12 @@ class RewriterTarget(object): self.rewriter.parse_data(data) def comment(self, data): - self.rewriter.out.write('') + self.rewriter.out.write(u'-->') def pi(self, data): - self.rewriter.out.write('') + self.rewriter.out.write(u'') def close(self): return '' @@ -192,6 +200,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm def parse(data, head_insert=None): parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) + data = data.decode('utf-8') print parser.rewrite(data) + parser.close() #return parser.rewrite(data) + parser.close() diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index f2c5a552..0d5245e0 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -66,11 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): status_headers, gen = result - buff = '' - for x in gen: - buff += x + #buff = u''.join(gen) - return (status_headers, buff) + return (status_headers, gen) #================================================================= diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_rewrite.py index 423490b1..9e3d0337 100644 --- a/pywb/rewrite/test/test_rewrite.py +++ b/pywb/rewrite/test/test_rewrite.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -r""" +ur""" #================================================================= # HTML Rewriting @@ -260,6 +260,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm def parse(data, head_insert = None): parser = HTMLRewriter(urlrewriter, head_insert = head_insert) + data = data.decode('utf-8') print parser.rewrite(data) + parser.close() arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/') diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 36e74848..13a941ea 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -64,7 +64,8 @@ def test_example_2(): def test_example_domain_specific_3(): - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) + urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2) # comment out bootloader assert '/* Bootloader.configurePage' in buff diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 263e979a..9545a040 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -61,8 +61,11 @@ class UrlRewriter(object): return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url) - def set_base_url(self, newUrl): - self.wburl.url = newUrl + def rebase_rewriter(self, new_url): + #self.wburl.url = newUrl + new_wburl = copy.copy(self.wburl) + new_wburl.url = new_url + return UrlRewriter(new_wburl, self.prefix) def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) @@ -94,5 +97,5 @@ class HttpsUrlRewriter(object): def get_abs_url(self, url=''): return url - def set_base_url(self, newUrl): - pass + def rebase_rewriter(self, new_url): + return self