html_rewriter: default attrs without value to empty str value, instead of no value

2025-03-15 00:03:28 +01:00 · 2014-01-24 00:49:51 -08:00 · 2014-01-24 00:49:51 -08:00 · 94326dafc1
commit 94326dafc1
parent 5987a0c047
2 changed files with 101 additions and 7 deletions
--- a/pywb/binsearch.py
+++ b/pywb/binsearch.py
@ -0,0 +1,92 @@
+from collections import deque
+import os
+import itertools
+
+class FileReader:
+    """Minimal line reader over a file on disk, opened in binary mode.
+
+    Provides the getsize() / readline() / seek() calls that the search
+    helpers in this module invoke on their `reader` argument, plus
+    close() so the caller can release the file handle when finished.
+    """
+
+    def __init__(self, filename):
+        """Open `filename` for binary reading and record its size in bytes."""
+        self.fh = open(filename, 'rb')
+        try:
+            self.size = os.path.getsize(filename)
+        except Exception:
+            # Do not leak the open handle if the size lookup fails.
+            self.fh.close()
+            raise
+
+    def getsize(self):
+        """Return the file size that was recorded when the reader was created."""
+        return self.size
+
+    def readline(self):
+        """Return the next line read from the current file position."""
+        return self.fh.readline()
+
+    def seek(self, offset):
+        """Move the read position to `offset`; returns whatever fh.seek() returns."""
+        return self.fh.seek(offset)
+
+    def close(self):
+        """Close the underlying file handle."""
+        self.fh.close()
+
+
+def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192):
+    """Binary-search `reader` by blocks and return a byte offset to scan from.
+
+    The file is treated as a sequence of `block_size`-byte blocks. Each probe
+    seeks to a block boundary, discards the (possibly partial) line found
+    there, and compares `key` against the next full line.
+
+    reader       -- object providing getsize(), seek(offset) and readline()
+    key          -- value passed as the first argument to compare_func
+    compare_func -- cmp-style function; a result > 0 for (key, line) moves
+                    the search toward later blocks
+    block_size   -- probe granularity in bytes
+
+    Returns the starting byte offset (a multiple of block_size) of the block
+    the search settled on.
+
+    NOTE(review): presumably the lines are sorted consistently with
+    compare_func, as any binary search requires -- confirm against callers.
+    """
+    # 'lo'/'hi' are block indices; named so the min()/max() builtins are not
+    # shadowed. Floor division keeps them integers under true division too.
+    lo = 0
+    hi = reader.getsize() // block_size
+
+    while hi - lo > 1:
+        mid = lo + ((hi - lo) // 2)
+        reader.seek(mid * block_size)
+
+        # mid is always >= 1 inside this loop, so the seek may have landed in
+        # the middle of a line: skip the partial line before comparing.
+        reader.readline()
+
+        line = reader.readline()
+
+        if compare_func(key, line) > 0:
+            lo = mid
+        else:
+            hi = mid
+
+    return lo * block_size
+
+
+def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
+    """Return an iterator over the lines of `reader` starting around `key`.
+
+    binsearch_offset() picks the block to start in; lines are then read
+    sequentially until one compares >= `key` (or the reader is exhausted).
+    This positioning happens eagerly, before the iterator is returned.
+
+    The iterator yields, in order:
+      * up to `prev_size` lines that were read immediately before the first
+        line comparing >= `key` (only lines seen during the sequential scan
+        are available, so fewer may be produced);
+      * that first line, if any, followed by every remaining line of the
+        reader until readline() returns an empty value.
+
+    reader       -- object providing getsize(), seek(offset) and readline()
+    key          -- value passed as the second argument to compare_func
+    prev_size    -- number of preceding lines to retain (<= 0 keeps none)
+    compare_func -- cmp-style function called as compare_func(line, key)
+    block_size   -- forwarded to binsearch_offset()
+    """
+    offset = binsearch_offset(reader, key, compare_func, block_size)
+
+    reader.seek(offset)
+
+    if offset > 0:
+        reader.readline() # skip partial line
+
+    # One bounded deque covers every prev_size: maxlen 0 retains nothing and
+    # maxlen 1 retains only the latest line. It is always defined, so the
+    # iterator can no longer hit an unassigned 'prev' when no line precedes
+    # the match.
+    prev_lines = deque(maxlen = max(prev_size, 0))
+
+    while True:
+        line = reader.readline()
+        if not line or compare_func(line, key) >= 0:
+            break
+        prev_lines.append(line)
+
+    def gen_iter(line):
+        for prev_line in prev_lines:
+            yield prev_line
+
+        while line:
+            yield line
+            line = reader.readline()
+
+    return gen_iter(line)
+
+
+# Iterate over the leading run of lines that start with key
+def iter_exact(reader, key):
+    """Yield lines from search(reader, key) while they start with `key`.
+
+    Iteration stops at the first line that does not begin with `key`.
+    """
+    has_prefix = lambda candidate: candidate.startswith(key)
+
+    for matched in itertools.takewhile(has_prefix, search(reader, key)):
+        yield matched
+
+
+
+
+
--- a/pywb/html_rewriter.py
+++ b/pywb/html_rewriter.py
@ -23,13 +23,14 @@ class WBHtml(HTMLParser):
    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

    >>> parse('<input "selected"><img src></div>')
-    <input "selected"><img src></div>
+    <input "selected"=""><img src=""></div>

    >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
    <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

+    # HTML Entities
    >>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
-    <a href>&rsaquo; &nbsp; &#62;</div>
+    <a href="">&rsaquo; &nbsp; &#62;</div>

    # Don't rewrite anchors
    >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
@ -47,7 +48,7 @@ class WBHtml(HTMLParser):
    <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

    >>> parse('<META http-equiv="refresh" content>')
-    <meta http-equiv="refresh" content>
+    <meta http-equiv="refresh" content="">

    # Script tag
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
@ -61,7 +62,7 @@ class WBHtml(HTMLParser):
    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

    >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
-    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
+    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

    >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
@ -218,11 +219,12 @@ class WBHtml(HTMLParser):
                if rwMod is not None:
                    attrValue = self._rewriteURL(attrValue, rwMod)

-            if attrValue is not None:
-                #self.out.write(' {0}="{1}"'.format(attrName, attrValue))
+            # parser doesn't differentiate between 'attr=""' and just 'attr'
+            # 'attr=""' is more common, so use that form
+            if attrValue:
                self.out.write(' ' + attrName + '="' + attrValue + '"')
            else:
-                self.out.write(' ' + attrName)
+                self.out.write(' ' + attrName + '=""')

        self.out.write('/>' if isStartEnd else '>')