mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
pycdx_server: initial binsearch module, with support for an exact-match iterator!
fix html_rewriter missing ; on entities. js rewriter: only rewrite full document.domain. PathIndexPrefixResolver using binsearch on path index, for #9. resolvers moved to replay_resolvers.py. improve path-resolver logic: each resolver returns an array of possible files (could be from primary or secondary storage). then, iterate over all possible files from all resolvers until a successful load, or raise an exception if all failed.
This commit is contained in:
parent
b237b144ff
commit
e95e17b9e6
2
__init__.py
Normal file
2
__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
#Allow importing
|
||||
|
@ -1,2 +1,3 @@
|
||||
#Allow importing
|
||||
|
||||
|
||||
|
@ -28,6 +28,9 @@ class WBHtml(HTMLParser):
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href>› ></div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<HTML><a href="#abc">Text</a></html>
|
||||
@ -215,7 +218,7 @@ class WBHtml(HTMLParser):
|
||||
if rwMod is not None:
|
||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||
|
||||
if attrValue:
|
||||
if attrValue is not None:
|
||||
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||
else:
|
||||
@ -280,10 +283,10 @@ class WBHtml(HTMLParser):
|
||||
self.parseData(data)
|
||||
|
||||
def handle_entityref(self, data):
|
||||
self.out.write('&' + data)
|
||||
self.out.write('&' + data + ';')
|
||||
|
||||
def handle_charref(self, data):
|
||||
self.out.write('&#' + data)
|
||||
self.out.write('&#' + data + ';')
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.out.write('<!--')
|
||||
|
4
pywb/pycdx_server/__init__.py
Normal file
4
pywb/pycdx_server/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
#Allow importing
|
||||
|
||||
#from pkgutil import extend_path
|
||||
#__path__ = extend_path(__path__, __name__)
|
92
pywb/pycdx_server/binsearch.py
Normal file
92
pywb/pycdx_server/binsearch.py
Normal file
@ -0,0 +1,92 @@
|
||||
from collections import deque
|
||||
import os
|
||||
import itertools
|
||||
|
||||
class FileReader(object):
    """Minimal reader over a file on disk, as used by the binary search helpers.

    Exposes the three operations the search functions call: ``getsize()``,
    ``readline()`` and ``seek()``.  The file is opened in binary mode and
    stays open until ``close()`` is called or the ``with`` block exits.
    """

    def __init__(self, filename):
        """Open *filename* for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        # Size is captured once at open time; later growth is not reflected.
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size (in bytes) recorded when the file was opened."""
        return self.size

    def readline(self):
        """Read and return the next line from the current position."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte *offset*."""
        return self.fh.seek(offset)

    def close(self):
        """Release the underlying file handle (safe to call more than once)."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress exceptions raised inside the ``with`` block.
        return False
|
||||
|
||||
|
||||
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a sorted, line-oriented file for a block near *key*.

    The file is treated as a sequence of ``block_size``-byte blocks.  The
    returned byte offset is the start of a block from which a forward linear
    scan (after discarding the first, possibly partial, line when the offset
    is non-zero) reaches the first line that is >= *key*.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to locate; compared against raw lines.
    :param compare_func: three-way comparison ``f(a, b)`` returning a
        negative, zero or positive number.  Defaults to ordinary ordering
        (the behaviour of the Python 2 builtin ``cmp``).
    :param block_size: size in bytes of each probe block.
    :returns: byte offset, always a multiple of ``block_size``.
    """
    if compare_func is None:
        # Equivalent of the Python 2 builtin ``cmp``, spelled out so the
        # function does not depend on that builtin being available.
        def _three_way(a, b):
            return (a > b) - (a < b)
        compare_func = _three_way

    low = 0
    # Floor division keeps block indices integral on every Python version.
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        # An empty read means the skipped line ran to end-of-file, so no
        # complete line starts in or after this block.  That skipped line may
        # itself be the match, so the search must move left, not right.
        if line and compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
|
||||
|
||||
|
||||
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over lines of a sorted file starting at *key*.

    Uses ``binsearch_offset`` to jump close to *key*, scans forward to the
    first line that is >= *key*, and returns a generator that yields up to
    ``prev_size`` lines preceding that line, then that line and every
    following line until end of file.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to locate; compared against raw lines.
    :param prev_size: how many lines before the first match to yield as well.
        Only lines read during the forward scan are available, so fewer may
        be yielded.
    :param compare_func: three-way comparison ``f(a, b)``; defaults to
        ordinary ordering (the behaviour of the Python 2 builtin ``cmp``).
    :param block_size: block size passed on to ``binsearch_offset``.
    :returns: generator of lines.
    """
    if compare_func is None:
        # Resolve the default here so a real callable is always passed on to
        # binsearch_offset.
        def _three_way(a, b):
            return (a > b) - (a < b)
        compare_func = _three_way

    start = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(start)

    if start > 0:
        reader.readline()  # skip partial line

    # One bounded deque covers every prev_size: with maxlen 0 it stays empty.
    # Previously prev_size == 1 used a bare variable that was left unassigned
    # when the very first line already matched, so iterating raised NameError.
    preceding = deque(maxlen=max(prev_size, 0))

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        # Lines are read lazily from the shared reader as the caller iterates.
        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
||||
|
||||
|
||||
def iter_exact(reader, key, token=None):
    """Iterate over the lines of a sorted file that match *key*.

    With the default ``token=None`` a line matches when it starts with
    *key*, so longer keys sharing the same prefix match too.  Pass the
    field separator of the file as *token* to restrict the match to lines
    whose leading text is exactly *key* followed by that separator.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: leading text to match.
    :param token: optional terminator appended to *key* before matching.
    :returns: generator of matching lines, in file order.
    """
    prefix = key if token is None else key + token

    for line in search(reader, prefix):
        # The file is sorted, so the first non-matching line ends the run.
        if not line.startswith(prefix):
            break

        yield line
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -28,6 +28,10 @@ class RegexRewriter:
|
||||
def archivalRewrite(rewriter):
|
||||
return lambda x: rewriter.rewrite(x)
|
||||
|
||||
@staticmethod
|
||||
def replacer(string):
|
||||
return lambda x: string
|
||||
|
||||
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
|
||||
|
||||
DEFAULT_OP = addPrefix
|
||||
@ -97,6 +101,9 @@ class JSRewriter(RegexRewriter):
|
||||
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||
|
||||
>>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
|
||||
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||
|
||||
# custom rules added
|
||||
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
|
||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
||||
@ -113,7 +120,8 @@ class JSRewriter(RegexRewriter):
|
||||
def _createRules(self, httpPrefix):
|
||||
return [
|
||||
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
|
||||
('location|domain', 'WB_wombat_', 0),
|
||||
('location', 'WB_wombat_', 0),
|
||||
('(?<=document\.)domain', 'WB_wombat_', 0),
|
||||
]
|
||||
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
import StringIO
|
||||
from urllib2 import URLError
|
||||
import chardet
|
||||
import redis
|
||||
import copy
|
||||
|
||||
import indexreader, archiveloader
|
||||
@ -89,14 +88,30 @@ class ReplayHandler(object):
|
||||
if failedFiles and filename in failedFiles:
|
||||
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
||||
|
||||
try:
|
||||
return self.archiveloader.load(self.resolveFull(filename), offset, length)
|
||||
any_found = False
|
||||
last_exc = None
|
||||
for resolver in self.resolvers:
|
||||
possible_paths = resolver(filename)
|
||||
|
||||
except URLError as ue:
|
||||
if failedFiles:
|
||||
failedFiles.append(filename)
|
||||
if possible_paths:
|
||||
for path in possible_paths:
|
||||
any_found = True
|
||||
try:
|
||||
return self.archiveloader.load(path, offset, length)
|
||||
|
||||
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
|
||||
except URLError as ue:
|
||||
last_exc = ue
|
||||
print last_exc
|
||||
pass
|
||||
|
||||
# Unsuccessful if reached here
|
||||
if failedFiles:
|
||||
failedFiles.append(filename)
|
||||
|
||||
if not any_found:
|
||||
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
||||
else:
|
||||
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
||||
|
||||
|
||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||
@ -399,25 +414,4 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
|
||||
|
||||
|
||||
#======================================
|
||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||
#======================================
|
||||
def PrefixResolver(prefix, contains):
|
||||
def makeUrl(url):
|
||||
return prefix + url if (contains in url) else None
|
||||
|
||||
return makeUrl
|
||||
|
||||
#======================================
|
||||
class RedisResolver:
|
||||
def __init__(self, redisUrl, keyPrefix = 'w:'):
|
||||
self.redisUrl = redisUrl
|
||||
self.keyPrefix = keyPrefix
|
||||
self.redis = redis.StrictRedis.from_url(redisUrl)
|
||||
|
||||
def __call__(self, filename):
|
||||
try:
|
||||
return self.redis.hget(self.keyPrefix + filename, 'path')
|
||||
except Exception as e:
|
||||
print e
|
||||
return None
|
||||
|
41
pywb/replay_resolvers.py
Normal file
41
pywb/replay_resolvers.py
Normal file
@ -0,0 +1,41 @@
|
||||
import redis
|
||||
import pycdx_server.binsearch as binsearch
|
||||
#======================================
|
||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||
#======================================
|
||||
def PrefixResolver(prefix, contains):
    """Build a resolver that prepends *prefix* to matching archive names.

    The returned callable takes a filename and returns a one-element list
    holding ``prefix + filename`` when *contains* occurs in the filename,
    or an empty list otherwise.
    """
    def makeUrl(url):
        if contains not in url:
            return []
        return [prefix + url]

    return makeUrl
|
||||
|
||||
#======================================
|
||||
class RedisResolver:
    """Resolve an archive filename to candidate paths via a Redis hash.

    For a filename, the ``path`` field of the hash stored under
    ``keyPrefix + filename`` is looked up.  Calling the resolver always
    returns a list: one element when a path is stored, empty otherwise.
    """

    def __init__(self, redisUrl, keyPrefix='w:'):
        """Connect to the Redis instance at *redisUrl*.

        :param redisUrl: URL passed to ``redis.StrictRedis.from_url``.
        :param keyPrefix: string prepended to each filename to form the key.
        """
        self.redisUrl = redisUrl
        self.keyPrefix = keyPrefix
        self.redis = redis.StrictRedis.from_url(redisUrl)

    def __call__(self, filename):
        """Return ``[path]`` for *filename*, or ``[]`` if none is available."""
        try:
            path = self.redis.hget(self.keyPrefix + filename, 'path')
        except Exception as e:
            # Deliberately best-effort: a failing Redis lookup is reported
            # and treated as "no candidates" rather than aborting resolution.
            print(e)
            return []

        # A missing key or field comes back empty; returning it wrapped in a
        # list would hand the caller a bogus candidate to try loading.
        return [path] if path else []
|
||||
|
||||
#======================================
|
||||
class PathIndexResolver:
    """Resolve an archive filename to candidate paths using a path index file.

    The index is searched with ``binsearch.iter_exact``.  Each matching line
    is split on a tab, and lines with exactly two fields contribute their
    second field (trailing whitespace stripped) as a candidate path.
    """

    def __init__(self, pathindex_file):
        """Open *pathindex_file* for binary searching."""
        self.reader = binsearch.FileReader(pathindex_file)

    def __call__(self, filename):
        """Return the list of candidate paths recorded for *filename*.

        The list is built eagerly.  All lookups share one reader, so a lazy
        generator would read from a position a later lookup may have moved;
        a list is also empty (falsy) when nothing matches, like the results
        of the other resolvers.
        """
        # NOTE(review): iter_exact matches on a leading-prefix basis, so an
        # index entry whose name merely starts with `filename` is included
        # too -- confirm whether that is intended.
        paths = []
        for pathline in binsearch.iter_exact(self.reader, filename):
            fields = pathline.split('\t')
            if len(fields) == 2:
                paths.append(fields[1].rstrip())

        return paths
|
||||
|
6
run.sh
6
run.sh
@ -3,14 +3,14 @@
|
||||
mypath=$(cd `dirname $0` && pwd)
|
||||
|
||||
app=$2
|
||||
cd $mypath/pywb
|
||||
#cd $mypath/pywb
|
||||
if [ -z "$app" ]; then
|
||||
app=wbapp.py
|
||||
app=pywb.wbapp
|
||||
fi
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
# Standard root config
|
||||
uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app
|
||||
uwsgi --static-map /static=$mypath/static --http-socket :8080 -b 65536 --wsgi $app
|
||||
else
|
||||
# Test on non-root mount
|
||||
uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name
|
||||
|
@ -4,7 +4,7 @@
|
||||
display: block !important;
|
||||
top: 0px !important;
|
||||
left: 0px !important;
|
||||
position: absolute !important;
|
||||
position: inherit !important;
|
||||
padding: 4px !important;
|
||||
width: 100% !important;
|
||||
font-size: 24px !important;
|
||||
|
Loading…
x
Reference in New Issue
Block a user