From 94326dafc14457ac0495a91e9b675be17405d897 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Jan 2014 00:49:51 -0800 Subject: [PATCH] html_rewriter: default attrs without value to empty str value, instead of no value --- pywb/binsearch.py | 92 +++++++++++++++++++++++++++++++++++++++++++ pywb/html_rewriter.py | 16 ++++---- 2 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 pywb/binsearch.py diff --git a/pywb/binsearch.py b/pywb/binsearch.py new file mode 100644 index 00000000..ef3171f1 --- /dev/null +++ b/pywb/binsearch.py @@ -0,0 +1,92 @@ +from collections import deque +import os +import itertools + +class FileReader: + def __init__(self, filename): + self.fh = open(filename, 'rb') + self.size = os.path.getsize(filename) + + def getsize(self): + return self.size + + def readline(self): + return self.fh.readline() + + def seek(self, offset): + return self.fh.seek(offset) + + +def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192): + min = 0 + max = reader.getsize() / block_size + + while (max - min > 1): + mid = min + ((max - min) / 2) + reader.seek(mid * block_size) + + if mid > 0: + reader.readline() # skip partial line + + line = reader.readline() + + if compare_func(key, line) > 0: + min = mid + else: + max = mid + + return (min * block_size) + + +def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192): + min = binsearch_offset(reader, key, compare_func, block_size) + + reader.seek(min) + + if min > 0: + reader.readline() # skip partial line + + if prev_size > 1: + prev_deque = deque(maxlen = prev_size) + + line = None + + while True: + line = reader.readline() + if not line: + break + if compare_func(line, key) >= 0: + break + + if prev_size == 1: + prev = line + elif prev_size > 1: + prev_deque.append(line) + + def gen_iter(line): + if prev_size == 1: + yield prev + elif prev_size > 1: + for i in prev_deque: + yield i + + while line: + yield line + line = reader.readline() + + return gen_iter(line) + + +# Iterate over exact matches +def iter_exact(reader, key): + lines = search(reader, key) + for x in lines: + if not x.startswith(key): + break + + yield x + + + + + diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py index 800f5dda..0ee6bf2a 100644 --- a/pywb/html_rewriter.py +++ b/pywb/html_rewriter.py @@ -23,13 +23,14 @@ class WBHtml(HTMLParser):
>>> parse('') - + >>> parse('') + # HTML Entities >>> parse('›   >') - ›   > + ›   > # Don't rewrite anchors >>> parse('Text') @@ -47,7 +48,7 @@ class WBHtml(HTMLParser): >>> parse('') - + # Script tag >>> parse('') @@ -61,7 +62,7 @@ class WBHtml(HTMLParser): >>> parse('
') -
+
>>> parse('') @@ -218,11 +219,12 @@ class WBHtml(HTMLParser): if rwMod is not None: attrValue = self._rewriteURL(attrValue, rwMod) - if attrValue is not None: - #self.out.write(' {0}="{1}"'.format(attrName, attrValue)) + # parser doesn't differentiate between 'attr=""' and just 'attr' + # 'attr=""' is more common, so use that form + if attrValue: self.out.write(' ' + attrName + '="' + attrValue + '"') else: - self.out.write(' ' + attrName) + self.out.write(' ' + attrName + '=""') self.out.write('/>' if isStartEnd else '>')