mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
pycdx_server: initial binsearch module, with support for an exact-match iterator.
Fix html_rewriter missing ';' on entities. JS rewriter: only rewrite the full document.domain. Add PathIndexPrefixResolver using binsearch on a path index, for #9. Resolvers moved to replay_resolvers.py. Improve path-resolver logic: each resolver returns an array of possible files (could be from primary or secondary storage); then iterate over all possible files from all resolvers until a successful load, or raise an exception if all failed.
This commit is contained in:
parent
b237b144ff
commit
e95e17b9e6
2
__init__.py
Normal file
2
__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#Allow importing
|
||||||
|
|
@ -1,2 +1,3 @@
|
|||||||
#Allow importing
|
#Allow importing
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,6 +28,9 @@ class WBHtml(HTMLParser):
|
|||||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||||
|
|
||||||
|
>>> parse('<a href="">› ></div>')
|
||||||
|
<a href>› ></div>
|
||||||
|
|
||||||
# Don't rewrite anchors
|
# Don't rewrite anchors
|
||||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||||
<HTML><a href="#abc">Text</a></html>
|
<HTML><a href="#abc">Text</a></html>
|
||||||
@ -215,7 +218,7 @@ class WBHtml(HTMLParser):
|
|||||||
if rwMod is not None:
|
if rwMod is not None:
|
||||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||||
|
|
||||||
if attrValue:
|
if attrValue is not None:
|
||||||
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||||
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||||
else:
|
else:
|
||||||
@ -280,10 +283,10 @@ class WBHtml(HTMLParser):
|
|||||||
self.parseData(data)
|
self.parseData(data)
|
||||||
|
|
||||||
def handle_entityref(self, data):
|
def handle_entityref(self, data):
|
||||||
self.out.write('&' + data)
|
self.out.write('&' + data + ';')
|
||||||
|
|
||||||
def handle_charref(self, data):
|
def handle_charref(self, data):
|
||||||
self.out.write('&#' + data)
|
self.out.write('&#' + data + ';')
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
self.out.write('<!--')
|
self.out.write('<!--')
|
||||||
|
4
pywb/pycdx_server/__init__.py
Normal file
4
pywb/pycdx_server/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#Allow importing
|
||||||
|
|
||||||
|
#from pkgutil import extend_path
|
||||||
|
#__path__ = extend_path(__path__, __name__)
|
92
pywb/pycdx_server/binsearch.py
Normal file
92
pywb/pycdx_server/binsearch.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
from collections import deque
|
||||||
|
import os
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
class FileReader:
    """Thin reader over a file on disk, opened in binary mode.

    Exposes the three operations the search helpers in this module call
    on a reader -- ``getsize()``, ``readline()`` and ``seek()`` -- plus
    ``close()`` and context-manager support so the underlying handle can
    be released deterministically.
    """

    def __init__(self, filename):
        """Open *filename* for reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except Exception:
            # Do not leak the handle if the size lookup fails.
            self.fh.close()
            raise

    def getsize(self):
        """Return the file size in bytes, as measured at construction."""
        return self.size

    def readline(self):
        """Return the next line from the current position ('' at EOF)."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte *offset*."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle; safe to call repeatedly."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with-block.
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a sorted, line-oriented file by fixed-size blocks.

    Returns a byte offset (a multiple of *block_size*) from which a
    forward, line-by-line scan will reach the first line that is not
    less than *key*.

    reader       -- object providing ``getsize()``, ``seek(offset)`` and
                    ``readline()``
    key          -- value compared against the lines read from *reader*
    compare_func -- three-way comparison ``f(a, b)`` returning a negative,
                    zero or positive number; when omitted, natural
                    ordering is used (equivalent to the Python 2 ``cmp``)
    block_size   -- granularity of the search, in bytes
    """
    if compare_func is None:
        def compare_func(a, b):
            return (a > b) - (a < b)

    low = 0
    high = reader.getsize() // block_size

    while high - low > 1:
        # mid is always >= low + 1 >= 1 here, so the seek never lands at
        # the start of the file and the first read is always discarded.
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)
        reader.readline()  # skip partial line

        line = reader.readline()

        # An empty read means no complete line starts at or after this
        # block, so any match has to begin earlier: narrow from the top
        # instead of comparing the key against ''.
        if line and compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
|
||||||
|
|
||||||
|
|
||||||
|
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over lines of a sorted file starting at *key*.

    The start block is located with ``binsearch_offset``; lines are then
    scanned forward until the first one that is not less than *key*.
    The returned generator yields, in order:

    * up to *prev_size* lines that immediately precede that line (only
      lines seen during the forward scan are available, so there may be
      fewer, or none), then
    * that line and every following line until end of file.

    The generator reads from *reader* lazily as it is consumed, so the
    reader must not be repositioned while iteration is in progress.

    compare_func -- three-way comparison ``f(a, b)``; when omitted,
                    natural ordering is used (equivalent to the Python 2
                    ``cmp``)
    """
    if compare_func is None:
        def compare_func(a, b):
            return (a > b) - (a < b)

    start = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(start)

    if start > 0:
        reader.readline()  # skip partial line

    # A bounded deque covers every prev_size: maxlen=0 keeps nothing,
    # maxlen=1 keeps just the previous line.  It also yields nothing
    # (instead of failing on an unbound name) when the very first line
    # scanned already matches.
    preceding = deque(maxlen=max(prev_size, 0))

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
||||||
|
|
||||||
|
|
||||||
|
# Iterate over exact matches
|
||||||
|
def iter_exact(reader, key):
    """Yield consecutive lines of *reader* that start with *key*.

    Lines are taken from ``search(reader, key)`` and iteration stops at
    the first line that does not begin with *key*.  Matching is by
    prefix, so a longer entry sharing the prefix is yielded as well.
    """
    def has_prefix(candidate):
        return candidate.startswith(key)

    for matched in itertools.takewhile(has_prefix, search(reader, key)):
        yield matched
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -28,6 +28,10 @@ class RegexRewriter:
|
|||||||
def archivalRewrite(rewriter):
|
def archivalRewrite(rewriter):
|
||||||
return lambda x: rewriter.rewrite(x)
|
return lambda x: rewriter.rewrite(x)
|
||||||
|
|
||||||
|
@staticmethod
def replacer(string):
    """Return a one-argument callable that ignores its argument and
    always returns *string*."""
    def constant(match):
        return string

    return constant
|
||||||
|
|
||||||
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
|
HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
|
||||||
|
|
||||||
DEFAULT_OP = addPrefix
|
DEFAULT_OP = addPrefix
|
||||||
@ -97,6 +101,9 @@ class JSRewriter(RegexRewriter):
|
|||||||
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||||
|
|
||||||
|
>>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
|
||||||
|
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||||
|
|
||||||
# custom rules added
|
# custom rules added
|
||||||
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
|
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
||||||
@ -113,7 +120,8 @@ class JSRewriter(RegexRewriter):
|
|||||||
def _createRules(self, httpPrefix):
|
def _createRules(self, httpPrefix):
|
||||||
return [
|
return [
|
||||||
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
|
(RegexRewriter.HTTPX_MATCH_STR, httpPrefix, 0),
|
||||||
('location|domain', 'WB_wombat_', 0),
|
('location', 'WB_wombat_', 0),
|
||||||
|
('(?<=document\.)domain', 'WB_wombat_', 0),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import StringIO
|
import StringIO
|
||||||
from urllib2 import URLError
|
from urllib2 import URLError
|
||||||
import chardet
|
import chardet
|
||||||
import redis
|
|
||||||
import copy
|
import copy
|
||||||
|
|
||||||
import indexreader, archiveloader
|
import indexreader, archiveloader
|
||||||
@ -89,14 +88,30 @@ class ReplayHandler(object):
|
|||||||
if failedFiles and filename in failedFiles:
|
if failedFiles and filename in failedFiles:
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
||||||
|
|
||||||
try:
|
any_found = False
|
||||||
return self.archiveloader.load(self.resolveFull(filename), offset, length)
|
last_exc = None
|
||||||
|
for resolver in self.resolvers:
|
||||||
|
possible_paths = resolver(filename)
|
||||||
|
|
||||||
except URLError as ue:
|
if possible_paths:
|
||||||
if failedFiles:
|
for path in possible_paths:
|
||||||
failedFiles.append(filename)
|
any_found = True
|
||||||
|
try:
|
||||||
|
return self.archiveloader.load(path, offset, length)
|
||||||
|
|
||||||
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
|
except URLError as ue:
|
||||||
|
last_exc = ue
|
||||||
|
print last_exc
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Unsuccessful if reached here
|
||||||
|
if failedFiles:
|
||||||
|
failedFiles.append(filename)
|
||||||
|
|
||||||
|
if not any_found:
|
||||||
|
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
||||||
|
else:
|
||||||
|
raise wbexceptions.ArchiveLoadFailed(filename, last_exc.reason if last_exc else '')
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
def doReplay(self, cdx, wbrequest, query, failedFiles):
|
||||||
@ -399,25 +414,4 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
|
return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
|
||||||
|
|
||||||
|
|
||||||
#======================================
|
|
||||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
|
||||||
#======================================
|
|
||||||
def PrefixResolver(prefix, contains):
|
|
||||||
def makeUrl(url):
|
|
||||||
return prefix + url if (contains in url) else None
|
|
||||||
|
|
||||||
return makeUrl
|
|
||||||
|
|
||||||
#======================================
|
|
||||||
class RedisResolver:
|
|
||||||
def __init__(self, redisUrl, keyPrefix = 'w:'):
|
|
||||||
self.redisUrl = redisUrl
|
|
||||||
self.keyPrefix = keyPrefix
|
|
||||||
self.redis = redis.StrictRedis.from_url(redisUrl)
|
|
||||||
|
|
||||||
def __call__(self, filename):
|
|
||||||
try:
|
|
||||||
return self.redis.hget(self.keyPrefix + filename, 'path')
|
|
||||||
except Exception as e:
|
|
||||||
print e
|
|
||||||
return None
|
|
||||||
|
41
pywb/replay_resolvers.py
Normal file
41
pywb/replay_resolvers.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
import redis
|
||||||
|
import pycdx_server.binsearch as binsearch
|
||||||
|
#======================================
|
||||||
|
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||||
|
#======================================
|
||||||
|
def PrefixResolver(prefix, contains):
    """Build a resolver that prepends *prefix* to matching names.

    The returned callable takes a name and returns a one-element list
    holding ``prefix + name`` when *contains* occurs in the name, or an
    empty list otherwise.
    """
    def makeUrl(url):
        if contains not in url:
            return []

        return [prefix + url]

    return makeUrl
|
||||||
|
|
||||||
|
#======================================
|
||||||
|
class RedisResolver:
    """Resolve an archive filename through a Redis hash lookup.

    For a given filename, the ``path`` field of the hash stored under
    ``keyPrefix + filename`` is read.  The result is always a list so it
    can be handled the same way as the other resolvers: one element when
    a path is stored, empty when there is none or the lookup fails.
    """

    def __init__(self, redisUrl, keyPrefix = 'w:'):
        self.redisUrl = redisUrl
        self.keyPrefix = keyPrefix
        self.redis = redis.StrictRedis.from_url(redisUrl)

    def __call__(self, filename):
        try:
            path = self.redis.hget(self.keyPrefix + filename, 'path')
        except Exception as e:
            # Best-effort lookup: report the problem and let the caller
            # fall through to any remaining resolvers.
            print(e)
            return []

        # A missing key/field comes back as None; wrapping it would give
        # the truthy list [None] and make the caller try to load it.
        if path is None:
            return []

        return [path]
|
||||||
|
|
||||||
|
#======================================
|
||||||
|
class PathIndexResolver:
    """Resolve an archive filename through a sorted path-index file.

    Each usable index line has exactly two tab-separated fields:
    ``<filename>\\t<path>``.  Calling the resolver returns the list of
    paths recorded for the filename (empty when there are none).
    """

    def __init__(self, pathindex_file):
        self.reader = binsearch.FileReader(pathindex_file)

    def __call__(self, filename):
        paths = []

        # The result is built eagerly: the reader is shared by every call
        # on this resolver, so a half-consumed lazy iterator would be
        # disturbed by the next lookup repositioning it.
        for pathline in binsearch.iter_exact(self.reader, filename):
            fields = pathline.split('\t')

            # iter_exact matches by prefix only, so also require the
            # first field to equal the requested name; otherwise an
            # entry for 'x.warc.gz' would be returned for 'x.warc'.
            if len(fields) == 2 and fields[0] == filename:
                paths.append(fields[1].rstrip())

        return paths
|
||||||
|
|
6
run.sh
6
run.sh
@ -3,14 +3,14 @@
|
|||||||
mypath=$(cd `dirname $0` && pwd)
|
mypath=$(cd `dirname $0` && pwd)
|
||||||
|
|
||||||
app=$2
|
app=$2
|
||||||
cd $mypath/pywb
|
#cd $mypath/pywb
|
||||||
if [ -z "$app" ]; then
|
if [ -z "$app" ]; then
|
||||||
app=wbapp.py
|
app=pywb.wbapp
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "$1" ]; then
|
if [ -z "$1" ]; then
|
||||||
# Standard root config
|
# Standard root config
|
||||||
uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app
|
uwsgi --static-map /static=$mypath/static --http-socket :8080 -b 65536 --wsgi $app
|
||||||
else
|
else
|
||||||
# Test on non-root mount
|
# Test on non-root mount
|
||||||
uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name
|
uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
display: block !important;
|
display: block !important;
|
||||||
top: 0px !important;
|
top: 0px !important;
|
||||||
left: 0px !important;
|
left: 0px !important;
|
||||||
position: absolute !important;
|
position: inherit !important;
|
||||||
padding: 4px !important;
|
padding: 4px !important;
|
||||||
width: 100% !important;
|
width: 100% !important;
|
||||||
font-size: 24px !important;
|
font-size: 24px !important;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user