From e95e17b9e69f483841456585a8a9655254b71543 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 23 Jan 2014 01:38:09 -0800 Subject: [PATCH] pycdx_server initial binsearch module, with support exact match iterator! fix html_rewriter missing ; on entities js rewriter: only rewrite full document.domain PathIndexPrefixResolver using binsearch on path index, for #9 resolvers moved to replay_resolvers.py improve path-resolver logic: each resolver returns an array of possible files (could be from primary or secondary storage). then, iterate over all possible files from all resolvers until a successful load, or raise exception if all failed --- __init__.py | 2 + pywb/__init__.py | 1 + pywb/html_rewriter.py | 9 ++-- pywb/pycdx_server/__init__.py | 4 ++ pywb/pycdx_server/binsearch.py | 92 ++++++++++++++++++++++++++++++++++ pywb/regex_rewriters.py | 10 +++- pywb/replay.py | 50 ++++++++---------- pywb/replay_resolvers.py | 41 +++++++++++++++ run.sh | 6 +-- static/wb.css | 2 +- 10 files changed, 181 insertions(+), 36 deletions(-) create mode 100644 __init__.py create mode 100644 pywb/pycdx_server/__init__.py create mode 100644 pywb/pycdx_server/binsearch.py create mode 100644 pywb/replay_resolvers.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..769c3cc7 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +#Allow importing + diff --git a/pywb/__init__.py b/pywb/__init__.py index 769c3cc7..1cdc4fe6 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,2 +1,3 @@ #Allow importing + diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py index 82113bfe..800f5dda 100644 --- a/pywb/html_rewriter.py +++ b/pywb/html_rewriter.py @@ -28,6 +28,9 @@ class WBHtml(HTMLParser): >>> parse('') + >>> parse('›   >') + ›   > + # Don't rewrite anchors >>> parse('Text') Text @@ -215,7 +218,7 @@ class WBHtml(HTMLParser): if rwMod is not None: attrValue = self._rewriteURL(attrValue, rwMod) - if attrValue: + if attrValue is not None: #self.out.write(' {0}="{1}"'.format(attrName, attrValue)) self.out.write(' ' + attrName + '="' + attrValue + '"') else: @@ -280,10 +283,10 @@ class WBHtml(HTMLParser): self.parseData(data) def handle_entityref(self, data): - self.out.write('&' + data) + self.out.write('&' + data + ';') def handle_charref(self, data): - self.out.write('&#' + data) + self.out.write('&#' + data + ';') def handle_comment(self, data): self.out.write('