1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

pycdx_server: initial binsearch module, with support for an exact-match iterator

fix html_rewriter missing ; on entities
js rewriter: only rewrite full document.domain
PathIndexPrefixResolver using binsearch on path index, for #9
resolvers moved to replay_resolvers.py

Improve path-resolver logic: each resolver returns an array of possible
files (which could be from primary or secondary storage).
Then, iterate over all possible files from all resolvers until
a load succeeds, or raise an exception if all of them fail.
This commit is contained in:
Ilya Kreymer 2014-01-23 01:38:09 -08:00
parent b237b144ff
commit e95e17b9e6
10 changed files with 181 additions and 36 deletions

2
__init__.py Normal file
View File

@ -0,0 +1,2 @@
#Allow importing

View File

@ -1,2 +1,3 @@
#Allow importing

View File

@ -28,6 +28,9 @@ class WBHtml(HTMLParser):
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href>&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<HTML><a href="#abc">Text</a></html>
@ -215,7 +218,7 @@ class WBHtml(HTMLParser):
if rwMod is not None:
attrValue = self._rewriteURL(attrValue, rwMod)
if attrValue:
if attrValue is not None:
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
self.out.write(' ' + attrName + '="' + attrValue + '"')
else:
@ -280,10 +283,10 @@ class WBHtml(HTMLParser):
self.parseData(data)
def handle_entityref(self, data):
self.out.write('&' + data)
self.out.write('&' + data + ';')
def handle_charref(self, data):
self.out.write('&#' + data)
self.out.write('&#' + data + ';')
def handle_comment(self, data):
self.out.write('<!--')

View File

@ -0,0 +1,4 @@
#Allow importing
#from pkgutil import extend_path
#__path__ = extend_path(__path__, __name__)

View File

@ -0,0 +1,92 @@
from collections import deque
import os
import itertools
class FileReader:
    """Line-oriented, seekable reader over a file on disk.

    Exposes the small interface the binary-search helpers rely on
    (``getsize``, ``readline``, ``seek``).  The file is opened in binary
    mode, and its size is captured once at construction time.

    The reader owns the underlying file handle; call ``close()`` when done,
    or use the instance as a context manager.
    """

    def __init__(self, filename):
        self.fh = open(filename, 'rb')
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes, as measured when it was opened."""
        return self.size

    def readline(self):
        """Return the next line from the current position ('' at EOF)."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte ``offset``."""
        return self.fh.seek(offset)

    def close(self):
        """Release the underlying file handle (safe to call repeatedly)."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a line-oriented file for the block where ``key`` belongs.

    The file is assumed to be sorted consistently with ``compare_func``.
    The search works on whole blocks of ``block_size`` bytes: at each step
    it seeks to a block boundary, discards the (possibly partial) line found
    there and compares ``key`` with the next full line.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to look for; must be comparable with the lines
        returned by ``reader.readline()``.
    :param compare_func: three-way comparison ``f(a, b)`` returning a
        negative, zero or positive number.  Defaults to the ``cmp`` builtin
        where it exists, otherwise to an equivalent fallback.
    :param block_size: size in bytes of the blocks being bisected.
    :return: byte offset (a multiple of ``block_size``) from which a forward
        line-by-line scan should start.
    """
    if compare_func is None:
        try:
            compare_func = cmp  # builtin on Python 2
        except NameError:
            # Same ordering as ``cmp`` for mutually comparable values.
            compare_func = lambda a, b: (a > b) - (a < b)

    lo = 0
    # Floor division keeps the block indexes (and so the seek offsets)
    # integral even when ``/`` means true division.
    hi = reader.getsize() // block_size

    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        # An empty line means the discarded line ran to end-of-file, so
        # nothing starts after this boundary.  The key may still be that
        # final line, which starts in an earlier block: narrow downwards
        # rather than skipping past it.
        if line and compare_func(key, line) > 0:
            lo = mid
        else:
            hi = mid

    return lo * block_size
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over lines starting at the first one >= ``key``.

    ``binsearch_offset`` narrows the start position to a block, then lines
    are read forward until one compares greater than or equal to ``key``
    (or the file ends).  The returned generator yields up to ``prev_size``
    of the lines immediately preceding that position (only those read
    during the forward scan are available), then the located line and every
    following line until end-of-file.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to look for; must be comparable with the lines.
    :param prev_size: maximum number of preceding lines to yield first.
    :param compare_func: three-way comparison ``f(a, b)``.  Defaults to the
        ``cmp`` builtin where it exists, otherwise to an equivalent
        fallback.
    :param block_size: block size passed on to ``binsearch_offset``.
    :return: generator of lines; it keeps reading from ``reader`` as it is
        consumed.
    """
    if compare_func is None:
        try:
            compare_func = cmp  # builtin on Python 2
        except NameError:
            # Same ordering as ``cmp`` for mutually comparable values.
            compare_func = lambda a, b: (a > b) - (a < b)

    start = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(start)

    if start > 0:
        reader.readline()  # skip partial line

    # A bounded deque keeps only the last ``prev_size`` lines.  Using it for
    # ``prev_size == 1`` as well means there is never an unassigned
    # "previous line" when the match is the very first line scanned.
    preceding = deque(maxlen=prev_size) if prev_size > 0 else None

    while True:
        line = reader.readline()
        if not line or compare_func(line, key) >= 0:
            break

        if preceding is not None:
            preceding.append(line)

    def gen_iter(line):
        if preceding is not None:
            for prev_line in preceding:
                yield prev_line

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
# Iterate over exact matches
def iter_exact(reader, key):
    """Yield the consecutive lines found by ``search`` that start with ``key``.

    Iteration begins at the position ``search(reader, key)`` locates and
    stops at the first line that does not start with ``key`` (that line is
    read from ``reader`` but not yielded).
    """
    has_key_prefix = lambda line: line.startswith(key)

    for line in itertools.takewhile(has_key_prefix, search(reader, key)):
        yield line

View File

@ -28,6 +28,10 @@ class RegexRewriter:
def archivalRewrite(rewriter):
return lambda x: rewriter.rewrite(x)
@staticmethod
def replacer(string):
return lambda x: string
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
DEFAULT_OP = addPrefix
@ -97,6 +101,9 @@ class JSRewriter(RegexRewriter):
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
@ -113,7 +120,8 @@ class JSRewriter(RegexRewriter):
def _createRules(self, httpPrefix):
return [
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
('location|domain', 'WB_wombat_', 0),
('location', 'WB_wombat_', 0),
('(?<=document\.)domain', 'WB_wombat_', 0),
]

View File

@ -1,7 +1,6 @@
import StringIO
from urllib2 import URLError
import chardet
import redis
import copy
import indexreader, archiveloader
@ -89,14 +88,30 @@ class ReplayHandler(object):
if failedFiles and filename in failedFiles:
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
try:
return self.archiveloader.load(self.resolveFull(filename), offset, length)
any_found = False
last_exc = None
for resolver in self.resolvers:
possible_paths = resolver(filename)
except URLError as ue:
if failedFiles:
failedFiles.append(filename)
if possible_paths:
for path in possible_paths:
any_found = True
try:
return self.archiveloader.load(path, offset, length)
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
except URLError as ue:
last_exc = ue
print last_exc
pass
# Unsuccessful if reached here
if failedFiles:
failedFiles.append(filename)
if not any_found:
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
else:
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
def doReplay(self, cdx, wbrequest, query, failedFiles):
@ -399,25 +414,4 @@ class RewritingReplayHandler(ReplayHandler):
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
def makeUrl(url):
return prefix + url if (contains in url) else None
return makeUrl
#======================================
class RedisResolver:
def __init__(self, redisUrl, keyPrefix = 'w:'):
self.redisUrl = redisUrl
self.keyPrefix = keyPrefix
self.redis = redis.StrictRedis.from_url(redisUrl)
def __call__(self, filename):
try:
return self.redis.hget(self.keyPrefix + filename, 'path')
except Exception as e:
print e
return None

41
pywb/replay_resolvers.py Normal file
View File

@ -0,0 +1,41 @@
import redis
import pycdx_server.binsearch as binsearch
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
    """Build a resolver that prepends ``prefix`` to matching file names.

    The returned callable takes one string and returns a one-element list
    ``[prefix + url]`` when ``contains`` occurs in it, or an empty list
    otherwise.
    """
    def makeUrl(url):
        if contains not in url:
            return []

        return [prefix + url]

    return makeUrl
#======================================
class RedisResolver:
    """Resolve a file name to a path stored in a Redis hash.

    For a given file name, the hash at ``keyPrefix + filename`` is queried
    for its ``'path'`` field.  The result is always a list: one element
    when a path is stored, empty when the field is missing or the lookup
    fails.
    """

    def __init__(self, redisUrl, keyPrefix='w:'):
        self.redisUrl = redisUrl
        self.keyPrefix = keyPrefix
        self.redis = redis.StrictRedis.from_url(redisUrl)

    def __call__(self, filename):
        try:
            path = self.redis.hget(self.keyPrefix + filename, 'path')
        except Exception as e:
            # Best-effort lookup: report the failure and behave as if
            # nothing was found.
            print(e)
            return []

        # A missing field comes back as None; wrapping it in a list would
        # produce a truthy ``[None]`` that looks like a real candidate.
        return [path] if path else []
#======================================
class PathIndexResolver:
    """Resolve a file name through a path index file.

    The index is read through ``binsearch.FileReader`` and queried with
    ``binsearch.iter_exact``.  Each matching line is split on tabs; lines
    that split into exactly two fields contribute their second field, with
    trailing whitespace stripped.  Other lines are ignored.
    """

    def __init__(self, pathindex_file):
        self.reader = binsearch.FileReader(pathindex_file)

    def __call__(self, filename):
        matches = binsearch.iter_exact(self.reader, filename)

        # Both stages are lazy, so the index is only read as the returned
        # generator is consumed.
        split_lines = (entry.split('\t') for entry in matches)
        return (fields[1].rstrip() for fields in split_lines
                if len(fields) == 2)

6
run.sh
View File

@ -3,14 +3,14 @@
mypath=$(cd `dirname $0` && pwd)
app=$2
cd $mypath/pywb
#cd $mypath/pywb
if [ -z "$app" ]; then
app=wbapp.py
app=pywb.wbapp
fi
if [ -z "$1" ]; then
# Standard root config
uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app
uwsgi --static-map /static=$mypath/static --http-socket :8080 -b 65536 --wsgi $app
else
# Test on non-root mount
uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name

View File

@ -4,7 +4,7 @@
display: block !important;
top: 0px !important;
left: 0px !important;
position: absolute !important;
position: inherit !important;
padding: 4px !important;
width: 100% !important;
font-size: 24px !important;