From 9654c22bed3ccfab4f5f7ddb9da9a728d84c0507 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 23 Mar 2014 23:46:49 -0700
Subject: [PATCH] rewrite: add doctype rewriting, more tests on various markup
edge cases
---
pywb/rewrite/lxml_html_rewriter.py | 14 +++++++++--
pywb/rewrite/test/test_html_rewriter.py | 26 ++++++++++++++++++--
pywb/rewrite/test/test_lxml_html_rewriter.py | 25 ++++++++++++++++---
3 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py
index 415334d3..ef809cf8 100644
--- a/pywb/rewrite/lxml_html_rewriter.py
+++ b/pywb/rewrite/lxml_html_rewriter.py
@@ -77,8 +77,18 @@ class RewriterTarget(object):
self.rewriter.parse_data(data)
self.rewriter.out.write(u'-->')
- def pi(self, data):
- self.rewriter.out.write(u'<?' + data + u'>')
+ def doctype(self, root_tag, public_id, system_id):
+ self.rewriter.out.write(u'')
+
+ def pi(self, target, data):
+ self.rewriter.out.write(u'<?' + target + ' ' + data + u'>')
def close(self):
return ''
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 20b0ad37..9dbe55ed 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -46,6 +46,9 @@ ur"""
>>> parse('')
+>>> parse('')
+
+
>>> parse('')
@@ -63,6 +66,7 @@ ur"""
>>> parse('')
+# Style
>>> parse('')
@@ -77,11 +81,29 @@ ur"""
>>> parse('
Test', head_insert = '')
Test
->>> parse('SomeTest
', head_insert = '/* Insert */')
-/* Insert */SomeTest
+>>> parse('SomeTest
', head_insert = '/* Insert */')
+/* Insert */SomeTest
>>> parse('SomeTest
', head_insert = '')
SomeTest
+
+>>> parse('')
+
+
+# uncommon markup
+>>> parse('')
+
+
+# no special cdata treatment, preserved in ')
+
+
+# CDATA outside of ')
+
+
+>>> parse('')
+
"""
from pywb.rewrite.url_rewriter import UrlRewriter