pycdx_server: initial binsearch module, with support for an exact-match iterator

- html_rewriter: fix missing ';' on entities
- JS rewriter: only rewrite the full document.domain
- PathIndexPrefixResolver: resolve files using binsearch on a path index (for #9)
- resolvers moved to replay_resolvers.py
- improve path-resolver logic: each resolver returns an array of possible files (which could be from primary or secondary storage); then iterate over all possible files from all resolvers until a load succeeds, or raise an exception if all of them failed
2025-03-15 00:03:28 +01:00 · 2014-01-23 01:38:09 -08:00 · 2014-01-23 01:38:09 -08:00 · e95e17b9e6
commit e95e17b9e6
parent b237b144ff
10 changed files with 181 additions and 36 deletions
--- a/init.py
+++ b/init.py
@ -0,0 +1,2 @@
+#Allow importing
+
--- a/pywb/init.py
+++ b/pywb/init.py
@ -1,2 +1,3 @@
 #Allow importing

+
--- a/pywb/html_rewriter.py
+++ b/pywb/html_rewriter.py
@ -28,6 +28,9 @@ class WBHtml(HTMLParser):
    >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
    <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

+    >>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
+    <a href>&rsaquo; &nbsp; &#62;</div>
+
    # Don't rewrite anchors
    >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
    <HTML><a href="#abc">Text</a></html>
@ -215,7 +218,7 @@ class WBHtml(HTMLParser):
                if rwMod is not None:
                    attrValue = self._rewriteURL(attrValue, rwMod)

-            if attrValue:
+            if attrValue is not None:
                #self.out.write(' {0}="{1}"'.format(attrName, attrValue))
                self.out.write(' ' + attrName + '="' + attrValue + '"')
            else:
@ -280,10 +283,10 @@ class WBHtml(HTMLParser):
        self.parseData(data)

    def handle_entityref(self, data):
-        self.out.write('&' + data)
+        self.out.write('&' + data + ';')

    def handle_charref(self, data):
-        self.out.write('&#' + data)
+        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        self.out.write('<!--')
--- a/pywb/pycdx_server/init.py
+++ b/pywb/pycdx_server/init.py
@ -0,0 +1,4 @@
+#Allow importing
+
+#from pkgutil import extend_path
+#__path__ = extend_path(__path__, __name__)
--- a/pywb/pycdx_server/binsearch.py
+++ b/pywb/pycdx_server/binsearch.py
@ -0,0 +1,92 @@
+from collections import deque
+import os
+import itertools
+
+class FileReader:
+    def __init__(self, filename):
+        self.fh = open(filename, 'rb')
+        self.size = os.path.getsize(filename)
+
+    def getsize(self):
+        return self.size
+
+    def readline(self):
+        return self.fh.readline()
+
+    def seek(self, offset):
+        return self.fh.seek(offset)
+
+
+def binsearch_offset(reader, key, compare_func = cmp, block_size = 8192):
+    min = 0
+    max = reader.getsize() / block_size
+
+    while (max - min > 1):
+        mid = min + ((max - min) / 2)
+        reader.seek(mid * block_size)
+
+        if mid > 0:
+            reader.readline() # skip partial line
+
+        line = reader.readline()
+
+        if compare_func(key, line) > 0:
+            min = mid
+        else:
+            max = mid
+
+    return (min * block_size)
+
+
+def search(reader, key, prev_size = 0, compare_func = cmp, block_size = 8192):
+    min = binsearch_offset(reader, key, compare_func, block_size)
+
+    reader.seek(min)
+
+    if min > 0:
+        reader.readline() # skip partial line
+
+    if prev_size > 1:
+        prev_deque = deque(maxlen = prev_size)
+
+    line = None
+
+    while True:
+        line = reader.readline()
+        if not line:
+            break
+        if compare_func(line, key) >= 0:
+            break
+
+        if prev_size == 1:
+            prev = line
+        elif prev_size > 1:
+            prev_deque.append(line)
+
+    def gen_iter(line):
+        if prev_size == 1:
+            yield prev
+        elif prev_size > 1:
+            for i in prev_deque:
+                yield i
+
+        while line:
+            yield line
+            line = reader.readline()
+
+    return gen_iter(line)
+
+
+# Iterate over exact matches
+def iter_exact(reader, key):
+    lines = search(reader, key)
+    for x in lines:
+        if not x.startswith(key):
+            break
+
+        yield x
+
+
+
+
+
--- a/pywb/regex_rewriters.py
+++ b/pywb/regex_rewriters.py
@ -28,6 +28,10 @@ class RegexRewriter:
    def archivalRewrite(rewriter):
        return lambda x: rewriter.rewrite(x)

+    @staticmethod
+    def replacer(string):
+        return lambda x: string 
+
    HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    DEFAULT_OP = addPrefix
@ -97,6 +101,9 @@ class JSRewriter(RegexRewriter):
    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

+    >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
+    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
+
    # custom rules added
    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
@ -113,7 +120,8 @@ class JSRewriter(RegexRewriter):
    def _createRules(self, httpPrefix):
        return [
             (RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
-             ('location|domain', 'WB_wombat_', 0),
+             ('location', 'WB_wombat_', 0),
+             ('(?<=document\.)domain', 'WB_wombat_', 0),
        ]


--- a/pywb/replay.py
+++ b/pywb/replay.py
@ -1,7 +1,6 @@
 import StringIO
 from urllib2 import URLError
 import chardet
-import redis
 import copy

 import indexreader, archiveloader
@ -89,14 +88,30 @@ class ReplayHandler(object):
        if failedFiles and filename in failedFiles:
            raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

-        try:
-            return self.archiveloader.load(self.resolveFull(filename), offset, length)
+        any_found = False
+        last_exc = None
+        for resolver in self.resolvers:
+            possible_paths = resolver(filename)

-        except URLError as ue:
-            if failedFiles:
-                failedFiles.append(filename)
+            if possible_paths:
+                for path in possible_paths:
+                    any_found = True
+                    try:
+                        return self.archiveloader.load(path, offset, length)

-            raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
+                    except URLError as ue:
+                        last_exc = ue
+                        print last_exc
+                        pass
+
+        # Unsuccessful if reached here
+        if failedFiles:
+           failedFiles.append(filename)
+
+        if not any_found:
+            raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
+        else:
+            raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')


    def doReplay(self, cdx, wbrequest, query, failedFiles):
@ -399,25 +414,4 @@ class RewritingReplayHandler(ReplayHandler):
        return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))


-#======================================
-# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
-#======================================
-def PrefixResolver(prefix, contains):
-    def makeUrl(url):
-        return prefix + url if (contains in url) else None

-    return makeUrl
-
-#======================================
-class RedisResolver:
-    def __init__(self, redisUrl, keyPrefix = 'w:'):
-        self.redisUrl = redisUrl
-        self.keyPrefix = keyPrefix
-        self.redis = redis.StrictRedis.from_url(redisUrl)
-
-    def __call__(self, filename):
-        try:
-            return self.redis.hget(self.keyPrefix + filename, 'path')
-        except Exception as e:
-            print e
-            return None
--- a/pywb/replay_resolvers.py
+++ b/pywb/replay_resolvers.py
@ -0,0 +1,41 @@
+import redis
+import pycdx_server.binsearch as binsearch
+#======================================
+# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
+#======================================
+def PrefixResolver(prefix, contains):
+    def makeUrl(url):
+        return [prefix + url] if (contains in url) else []
+
+    return makeUrl
+
+#======================================
+class RedisResolver:
+    def __init__(self, redisUrl, keyPrefix = 'w:'):
+        self.redisUrl = redisUrl
+        self.keyPrefix = keyPrefix
+        self.redis = redis.StrictRedis.from_url(redisUrl)
+
+    def __call__(self, filename):
+        try:
+            return [self.redis.hget(self.keyPrefix + filename, 'path')]
+        except Exception as e:
+            print e
+            return None
+
+#======================================
+class PathIndexResolver:
+    def __init__(self, pathindex_file):
+        self.reader = binsearch.FileReader(pathindex_file)
+
+    def __call__(self, filename):
+        result = binsearch.iter_exact(self.reader, filename)
+
+        def gen_list(result):
+            for pathline in result:
+                path = pathline.split('\t')
+                if len(path) == 2:
+                    yield path[1].rstrip()
+
+        return gen_list(result)
+
--- a/run.sh
+++ b/run.sh
@ -3,14 +3,14 @@
 mypath=$(cd `dirname $0` && pwd)

 app=$2
-cd $mypath/pywb
+#cd $mypath/pywb
 if [ -z "$app" ]; then
-  app=wbapp.py
+  app=pywb.wbapp
 fi

 if [ -z "$1" ]; then
  # Standard root config
-  uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app
+  uwsgi --static-map /static=$mypath/static --http-socket :8080 -b 65536 --wsgi $app
 else
  # Test on non-root mount
  uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name
--- a/static/wb.css
+++ b/static/wb.css
@ -4,7 +4,7 @@
  display: block !important;
  top: 0px !important;
  left: 0px !important;
-  position: absolute !important;
+  position: inherit !important;
  padding: 4px !important;
  width: 100% !important;
  font-size: 24px !important;