mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
first iteration of archival mode working w/ banner insertion!!
This commit is contained in:
parent
16f458d5ec
commit
a84ec2abc7
@ -15,8 +15,8 @@ class HttpStreamLoader:
|
|||||||
self.hmacDuration = hmacDuration
|
self.hmacDuration = hmacDuration
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
if length:
|
if length > 0:
|
||||||
rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1)
|
rangeHeader = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
||||||
else:
|
else:
|
||||||
rangeHeader = 'bytes={0}-'.format(offset)
|
rangeHeader = 'bytes={0}-'.format(offset)
|
||||||
|
|
||||||
@ -31,7 +31,20 @@ class HttpStreamLoader:
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders')
|
# Untested, but for completeness
|
||||||
|
class FileStreamLoader:
    """Stream loader for archive files that live on the local filesystem.

    Offers the same ``load(url, offset, length)`` call as HttpStreamLoader so
    that it can be registered for the ``file`` and empty url schemes.
    """

    def load(self, url, offset, length):
        """Open ``url`` for binary reading, positioned at ``offset``.

        url    -- a plain filesystem path, optionally prefixed with 'file://'
        offset -- byte position to seek to; an int or a numeric string
        length -- accepted for interface parity with the other loaders and
                  not used here: the returned file is not length-limited

        Returns the open file object; the caller is responsible for closing it.
        Raises whatever open() / int() / seek() raise for a bad path or offset.
        """
        if url.startswith('file://'):
            url = url[len('file://'):]

        afile = open(url, 'rb')

        # Don't leak the handle if the offset turns out to be unusable
        try:
            afile.seek(int(offset))
        except Exception:
            afile.close()
            raise

        return afile
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders')
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchiveLoader:
|
class ArchiveLoader:
|
||||||
@ -46,14 +59,17 @@ class ArchiveLoader:
|
|||||||
'.arc': (hanzo.warctools.ArcRecord, 'arc', False),
|
'.arc': (hanzo.warctools.ArcRecord, 'arc', False),
|
||||||
}
|
}
|
||||||
|
|
||||||
HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ ((\d+).*)$')
|
HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def createDefaultLoaders():
|
def createDefaultLoaders():
|
||||||
http = HttpStreamLoader()
|
http = HttpStreamLoader()
|
||||||
|
file = FileStreamLoader()
|
||||||
return {
|
return {
|
||||||
'http': http,
|
'http': http,
|
||||||
'https': http,
|
'https': http,
|
||||||
|
'file': file,
|
||||||
|
'': file
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -86,10 +102,15 @@ class ArchiveLoader:
|
|||||||
else:
|
else:
|
||||||
decomp = None
|
decomp = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
length = int(length)
|
||||||
|
except:
|
||||||
|
length = -1
|
||||||
|
|
||||||
raw = loader.load(url, offset, length)
|
|
||||||
|
|
||||||
reader = LineReader(raw, self.chunkSize, decomp)
|
raw = loader.load(url, long(offset), length)
|
||||||
|
|
||||||
|
reader = LineReader(raw, length, self.chunkSize, decomp)
|
||||||
|
|
||||||
parser = loaderCls.make_parser()
|
parser = loaderCls.make_parser()
|
||||||
|
|
||||||
@ -104,27 +125,33 @@ class ArchiveLoader:
|
|||||||
|
|
||||||
|
|
||||||
if aFormat == 'arc':
|
if aFormat == 'arc':
|
||||||
recType = 'arc-response'
|
recType = 'response'
|
||||||
empty = (utils.get_header(parsed.headers, 'length') == 0)
|
empty = (utils.get_header(parsed.headers, 'length') == 0)
|
||||||
else:
|
else:
|
||||||
recType = utils.get_header(parsed.headers, 'WARC-Type')
|
recType = utils.get_header(parsed.headers, 'WARC-Type')
|
||||||
empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
|
empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')
|
||||||
|
|
||||||
parsed.recType = recType
|
# special case: empty w/arc record (hopefully a revisit)
|
||||||
parsed.aFormat = aFormat
|
|
||||||
|
|
||||||
if empty:
|
if empty:
|
||||||
return WBArchiveRecord(parsed, reader, '400', [])
|
statusline = '204 No Content'
|
||||||
|
headers = []
|
||||||
|
|
||||||
|
# special case: warc records that are not expected to have http headers
|
||||||
|
# attempt to add 200 status and content-type
|
||||||
elif recType == 'metadata' or recType == 'resource':
|
elif recType == 'metadata' or recType == 'resource':
|
||||||
|
statusline = '200 OK'
|
||||||
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
|
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
|
||||||
|
|
||||||
return WBArchiveRecord(parsed, reader, '200 OK', headers)
|
# special case: http 0.9 response, no status or headers
|
||||||
|
elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
|
||||||
|
statusline = '200 OK'
|
||||||
|
headers = []
|
||||||
|
|
||||||
|
# response record: parse HTTP status and headers!
|
||||||
else:
|
else:
|
||||||
(statusline, headers) = self.parseHttpHeaders(reader)
|
(statusline, headers) = self.parseHttpHeaders(reader)
|
||||||
|
|
||||||
return WBArchiveRecord(parsed, reader, statusline, headers)
|
return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers)
|
||||||
|
|
||||||
|
|
||||||
def parseHttpHeaders(self, stream):
|
def parseHttpHeaders(self, stream):
|
||||||
@ -153,20 +180,23 @@ class ArchiveLoader:
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class LineReader:
|
class LineReader:
|
||||||
def __init__(self, stream, chunkSize = 1024, decomp = None):
|
def __init__(self, stream, maxLen = 0, chunkSize = 1024, decomp = None):
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
self.chunkSize = chunkSize
|
self.chunkSize = chunkSize
|
||||||
self.decomp = decomp
|
self.decomp = decomp
|
||||||
self.buff = None
|
self.buff = None
|
||||||
self.numread = 0
|
self.numRead = 0
|
||||||
|
self.maxLen = maxLen
|
||||||
|
|
||||||
def _fillbuff(self, chunkSize = None):
|
def _fillbuff(self, chunkSize = None):
|
||||||
if not chunkSize:
|
if not chunkSize:
|
||||||
chunkSize = self.chunkSize
|
chunkSize = self.chunkSize
|
||||||
|
|
||||||
if not self.buff or self.buff.pos >= self.buff.len:
|
if not self.buff or self.buff.pos >= self.buff.len:
|
||||||
data = self.stream.read(chunkSize)
|
toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize
|
||||||
self.numread += len(data)
|
data = self.stream.read(toRead)
|
||||||
|
self.numRead += len(data)
|
||||||
|
|
||||||
if self.decomp:
|
if self.decomp:
|
||||||
data = self.decomp.decompress(data)
|
data = self.decomp.decompress(data)
|
||||||
|
|
||||||
|
@ -80,6 +80,9 @@ class RemoteCDXServer:
|
|||||||
ArchivalUrl.REPLAY:
|
ArchivalUrl.REPLAY:
|
||||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||||
|
|
||||||
|
# BUG: resolveRevisits currently doesn't work for this type of query
|
||||||
|
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
||||||
|
# but may be an issue in proxy mode
|
||||||
ArchivalUrl.LATEST_REPLAY:
|
ArchivalUrl.LATEST_REPLAY:
|
||||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||||
|
|
||||||
|
@ -91,8 +91,8 @@ class JSRewriter(RegexRewriter):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, httpPrefix, extra = []):
|
def __init__(self, rewriter, extra = []):
|
||||||
rules = self._createRules(httpPrefix)
|
rules = self._createRules(rewriter.getAbsUrl())
|
||||||
rules.extend(extra)
|
rules.extend(extra)
|
||||||
|
|
||||||
RegexRewriter.__init__(self, rules)
|
RegexRewriter.__init__(self, rules)
|
||||||
@ -167,12 +167,10 @@ class CSSRewriter(RegexRewriter):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
rwPrefix = '/web/20131010im_/'
|
|
||||||
|
|
||||||
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
|
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
|
||||||
|
|
||||||
def test_js(string, extra = []):
|
def test_js(string, extra = []):
|
||||||
return JSRewriter(rwPrefix, extra).replaceAll(string)
|
return JSRewriter(arcrw, extra).replaceAll(string)
|
||||||
|
|
||||||
def test_css(string):
|
def test_css(string):
|
||||||
return CSSRewriter(arcrw).replaceAll(string)
|
return CSSRewriter(arcrw).replaceAll(string)
|
||||||
|
228
pywb/replay.py
228
pywb/replay.py
@ -1,8 +1,32 @@
|
|||||||
|
import StringIO
|
||||||
|
|
||||||
import indexreader
|
import indexreader
|
||||||
from wbrequestresponse import WbResponse
|
from wbrequestresponse import WbResponse
|
||||||
|
from wbarchivalurl import ArchivalUrl
|
||||||
import utils
|
import utils
|
||||||
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
class ReplayHandler:
|
import wbhtml
|
||||||
|
import regexmatch
|
||||||
|
import wbexceptions
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class FullHandler:
    """Chains a query handler and a replay handler behind a single callable.

    The query handler always runs first. For query-type archival urls its
    response is the final answer; for everything else that response is handed
    to the replay handler.
    """

    def __init__(self, query, replay):
        self.query = query
        self.replay = replay

    def __call__(self, wbrequest, _):
        # second argument is unused: the query step always starts from scratch
        query_response = self.query(wbrequest, None)

        isQuery = ((wbrequest.wb_url.type == ArchivalUrl.QUERY) or
                   (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY))

        if not isQuery:
            return self.replay(wbrequest, query_response)

        return query_response
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class ReplayHandler(object):
|
||||||
def __init__(self, resolvers, archiveloader):
|
def __init__(self, resolvers, archiveloader):
|
||||||
self.resolvers = resolvers
|
self.resolvers = resolvers
|
||||||
self.archiveloader = archiveloader
|
self.archiveloader = archiveloader
|
||||||
@ -11,38 +35,45 @@ class ReplayHandler:
|
|||||||
cdxlist = query_response.body
|
cdxlist = query_response.body
|
||||||
last_e = None
|
last_e = None
|
||||||
first = True
|
first = True
|
||||||
|
|
||||||
for cdx in cdxlist:
|
for cdx in cdxlist:
|
||||||
try:
|
try:
|
||||||
cdx = indexreader.CDXCaptureResult(cdx)
|
cdx = indexreader.CDXCaptureResult(cdx)
|
||||||
|
|
||||||
# First time through, check if do redirect before warc load
|
# ability to intercept and redirect
|
||||||
if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
|
if first:
|
||||||
return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])
|
self._checkRedir(wbrequest, cdx)
|
||||||
|
first = False
|
||||||
|
|
||||||
response = self.doReplay(cdx, wbrequest)
|
response = self.doReplay(cdx, wbrequest)
|
||||||
|
|
||||||
if response:
|
if response:
|
||||||
# if a fallback, redirect to exact timestamp!
|
response.cdx = cdx
|
||||||
if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
|
|
||||||
response.close()
|
|
||||||
return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])
|
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
first = False
|
#except wbexceptions.InternalRedirect as ir:
|
||||||
|
# raise ir
|
||||||
|
|
||||||
except Exception, e:
|
except wbexceptions.CaptureException as ce:
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
last_e = e
|
last_e = ce
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if last_e:
|
if last_e:
|
||||||
raise last_e
|
raise last_e
|
||||||
|
else:
|
||||||
|
raise wbexceptions.ArchiveLoadFailed()
|
||||||
|
|
||||||
|
def _checkRedir(self, wbrequest, cdx):
|
||||||
|
return None
|
||||||
|
|
||||||
def _load(self, cdx, revisit = False):
|
def _load(self, cdx, revisit = False):
|
||||||
prefix = '' if not revisit else 'orig.'
|
if revisit:
|
||||||
return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length'])
|
return self.archiveloader.load(self.resolveFull(cdx['orig.filename']), cdx['orig.offset'], cdx['orig.length'])
|
||||||
|
else:
|
||||||
|
return self.archiveloader.load(self.resolveFull(cdx['filename']), cdx['offset'], cdx['length'])
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest):
|
def doReplay(self, cdx, wbrequest):
|
||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
@ -75,19 +106,8 @@ class ReplayHandler:
|
|||||||
else:
|
else:
|
||||||
raise wbexceptions.CaptureException('Invalid CDX' + cdx)
|
raise wbexceptions.CaptureException('Invalid CDX' + cdx)
|
||||||
|
|
||||||
# Check for self redirect
|
|
||||||
if headersRecord.statusline.startswith('3'):
|
|
||||||
if self.isSelfRedirect(wbrequest, headersRecord):
|
|
||||||
raise wbexception.CaptureException('Self Redirect: ' + cdx)
|
|
||||||
|
|
||||||
return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
|
return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)
|
||||||
|
|
||||||
def isSelfRedirect(self, wbrequest, record):
|
|
||||||
requestUrl = wbrequest.wb_url.url.lower()
|
|
||||||
locationUrl = utils.get_header(record.httpHeaders, 'Location').lower()
|
|
||||||
return requestUrl == locationUrl
|
|
||||||
#ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)
|
|
||||||
|
|
||||||
|
|
||||||
def resolveFull(self, filename):
|
def resolveFull(self, filename):
|
||||||
# Attempt to resolve cdx file to full path
|
# Attempt to resolve cdx file to full path
|
||||||
@ -100,6 +120,164 @@ class ReplayHandler:
|
|||||||
raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename)
|
raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class RewritingReplayHandler(ReplayHandler):
    """Replay handler that rewrites archived responses for archival-mode replay.

    On top of ReplayHandler it:
      * raises an InternalRedirect when the capture that was found has a
        different timestamp than the one requested,
      * rejects captures that redirect to themselves,
      * prefixes / rewrites the archived HTTP headers,
      * rewrites html, css and js bodies through the url rewriter, passing
        ``headInsert`` to the html rewriter.
    """

    # canonical rewrite type -> substrings looked for in the Content-Type value
    REWRITE_TYPES = {
        'html': ('text/html', 'application/xhtml'),
        # must stay a tuple (note the trailing comma): a bare string would be
        # iterated character by character in _canonContentType
        'css':  ('text/css',),
        'js':   ('text/javascript', 'application/javascript', 'application/x-javascript'),
        'xml':  ('/xml', '+xml', '.xml', '.rss'),
    }

    # Order in which REWRITE_TYPES is consulted. Dict order is not reliable and
    # some values match more than one entry ('application/xhtml+xml' matches
    # both 'html' and 'xml'), so the more specific types are listed first.
    REWRITE_TYPE_ORDER = ('html', 'css', 'js', 'xml')

    # headers passed through unchanged
    PROXY_HEADERS = ('content-type', 'content-disposition')

    # headers whose value is a url and has to be rewritten
    URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base')

    # headers describing the body encoding, dropped when the body is rewritten
    ENCODING_HEADERS = ('content-encoding', 'transfer-encoding')


    def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None):
        """
        resolvers, archiveloader -- passed on to ReplayHandler
        headerPrefix -- prepended to the name of every archived header that is
                        neither proxied, url-rewritten nor an encoding header
        headInsert   -- optional markup passed to the html rewriter
        """
        ReplayHandler.__init__(self, resolvers, archiveloader)
        self.headerPrefix = headerPrefix
        self.headInsert = headInsert


    def _canonContentType(self, contentType):
        """Map a Content-Type value to a key of REWRITE_TYPES.

        Returns None when the header is missing or matches no rewritable type,
        in which case the body is sent through untouched.
        """
        if not contentType:
            return None

        # the configured mime substrings are all lower case
        contentType = contentType.lower()

        # known types in priority order, then any extra types a subclass added
        canonTypes = [name for name in self.REWRITE_TYPE_ORDER if name in self.REWRITE_TYPES]
        canonTypes += [name for name in self.REWRITE_TYPES if name not in canonTypes]

        for canonType in canonTypes:
            for mime in self.REWRITE_TYPES[canonType]:
                if mime in contentType:
                    return canonType

        return None


    def __call__(self, wbrequest, query_response):
        """Replay the capture for ``wbrequest`` and rewrite it where needed.

        Returns a WbResponse. Raises wbexceptions.InternalRedirect (via
        _checkRedir) when the loaded capture has a different timestamp than the
        request; anything ReplayHandler.__call__ raises propagates unchanged.
        """
        urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix)
        # _checkRedir picks the rewriter up from the request
        wbrequest.urlrewriter = urlrewriter

        response = ReplayHandler.__call__(self, wbrequest, query_response)

        if response and response.cdx:
            self._checkRedir(wbrequest, response.cdx)

        # Transparent!
        if wbrequest.wb_url.mod == 'id_':
            return response

        contentType = utils.get_header(response.headersList, 'Content-Type')

        canonType = self._canonContentType(contentType)

        (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, (canonType is not None), urlrewriter)

        # binary type, just send through
        if canonType is None:
            response.headersList = newHeaders
            return response

        # Handle text rewriting
        # TODO: better way to pass this
        stream = response._stream

        # special case -- need to ungzip the body
        if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
            import zlib
            # NOTE(review): the original referenced archiveloader.LineStream
            # without importing anything. LineReader is the reader class that
            # accepts a ``decomp`` object; it is assumed to live in the
            # archiveloader module and to offer read()/close() -- confirm.
            import archiveloader
            stream = archiveloader.LineReader(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))

        return self._rewriteContent(canonType, urlrewriter, stream, newHeaders, response)


    # TODO: first non-streaming attempt, probably want to stream
    def _rewriteContent(self, canonType, urlrewriter, stream, newHeaders, origResponse):
        """Build a new WbResponse whose body is the rewritten ``stream``.

        html is parsed completely and buffered in memory; css and js are
        rewritten chunk by chunk as the response is iterated. Types that have
        no rewriter (currently 'xml') are passed through as read.
        The status of ``origResponse`` is kept, ``newHeaders`` replace its headers.
        """
        if canonType == 'html':
            out = StringIO.StringIO()
            htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)

            try:
                buff = stream.read()
                while buff:
                    htmlrewriter.feed(buff)
                    buff = stream.read()

                htmlrewriter.close()
                value = [out.getvalue()]

            finally:
                out.close()
                # the source is fully consumed (or failed): release it either way
                stream.close()

        else:
            if canonType == 'css':
                rewriter = regexmatch.CSSRewriter(urlrewriter)
            elif canonType == 'js':
                rewriter = regexmatch.JSRewriter(urlrewriter)
            else:
                # no rewriter for this type: stream the content as is
                rewriter = None

            def gen():
                try:
                    buff = stream.read()
                    while buff:
                        yield rewriter.replaceAll(buff) if rewriter else buff
                        buff = stream.read()

                finally:
                    stream.close()

            value = gen()

        return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)


    def _rewriteHeaders(self, headers, stripEncoding = False, urlrewriter = None):
        """Split archived headers into those to send and those to withhold.

        headers       -- list of (name, value) pairs from the archived record
        stripEncoding -- when True, encoding headers are withheld because the
                         body is about to be re-encoded
        urlrewriter   -- rewriter applied to url-valued headers; when omitted
                         those headers keep their archived value

        Returns (newHeaders, removedHeaders).
        """
        newHeaders = []
        removedHeaders = []

        for (name, value) in headers:
            lowername = name.lower()

            if lowername in self.PROXY_HEADERS:
                newHeaders.append((name, value))

            elif lowername in self.URL_REWRITE_HEADERS:
                if urlrewriter:
                    value = urlrewriter.rewrite(value)
                newHeaders.append((name, value))

            elif lowername in self.ENCODING_HEADERS:
                if stripEncoding:
                    removedHeaders.append((name, value))
                else:
                    newHeaders.append((name, value))

            else:
                newHeaders.append((self.headerPrefix + name, value))

        return (newHeaders, removedHeaders)


    def _checkRedir(self, wbrequest, cdx):
        """Raise InternalRedirect when ``cdx`` is not the requested timestamp.

        Expects ``wbrequest.urlrewriter`` to be set, which __call__ does.
        Returns None when no redirect is needed.
        """
        if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
            newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
            raise wbexceptions.InternalRedirect(newUrl)

        return None


    def doReplay(self, cdx, wbrequest):
        """Load the capture like ReplayHandler, rejecting self redirects.

        Raises wbexceptions.CaptureException for a 3xx capture whose Location
        points back at the requested url.
        """
        wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest)

        # Check for self redirect
        if wbresponse and wbresponse.status.startswith('3'):
            if self.isSelfRedirect(wbrequest, wbresponse.headersList):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return wbresponse


    def isSelfRedirect(self, wbrequest, httpHeaders):
        """True if the Location header equals the requested url, ignoring
        case and protocol."""
        locationUrl = utils.get_header(httpHeaders, 'Location')

        # a 3xx response without a Location header cannot redirect to itself
        if not locationUrl:
            return False

        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = locationUrl.lower()

        return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl))
|
||||||
|
|
||||||
|
|
||||||
#======================================
|
#======================================
|
||||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||||
#======================================
|
#======================================
|
||||||
|
@ -1,8 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
app=$1
|
|
||||||
if [ -z "$app" ]; then
|
|
||||||
app=wbapp.py
|
|
||||||
fi
|
|
||||||
|
|
||||||
uwsgi --http :9090 --wsgi-file $app
|
|
@ -17,6 +17,16 @@ def get_header(headersList, name):
|
|||||||
if (value[0].lower() == nameLower):
|
if (value[0].lower() == nameLower):
|
||||||
return value[1]
|
return value[1]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def contains_header(headersList, seekHeader):
    """Check for a header with a specific value.

    seekHeader is a (name, value) pair. The name is looked up through
    get_header; the result is True only when a non-empty value is found and
    it is equal to the value being sought.
    """
    found = get_header(headersList, seekHeader[0])

    # see if found header matches value! (missing or empty never matches)
    return bool(found) and (found == seekHeader[1])
|
||||||
|
|
||||||
class HMACCookieMaker:
|
class HMACCookieMaker:
|
||||||
def __init__(self, key, name):
|
def __init__(self, key, name):
|
||||||
self.key = key
|
self.key = key
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from query import QueryHandler
|
from query import QueryHandler
|
||||||
|
from replay import FullHandler
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
|
|
||||||
from wbrequestresponse import WbResponse
|
from wbrequestresponse import WbResponse
|
||||||
@ -19,7 +20,16 @@ class WBHandler:
|
|||||||
query = QueryHandler()
|
query = QueryHandler()
|
||||||
|
|
||||||
import testwb
|
import testwb
|
||||||
replay = testwb.createReplay()
|
|
||||||
|
headInsert = """
|
||||||
|
|
||||||
|
<!-- WB Insert -->
|
||||||
|
<script src='/static/wb.js'> </script>
|
||||||
|
<link rel='stylesheet' href='/static/wb.css'/>
|
||||||
|
<!-- End WB Insert -->
|
||||||
|
"""
|
||||||
|
|
||||||
|
replay = testwb.createReplay(headInsert)
|
||||||
|
|
||||||
## ===========
|
## ===========
|
||||||
parser = ArchivalRequestRouter(
|
parser = ArchivalRequestRouter(
|
||||||
@ -28,6 +38,7 @@ parser = ArchivalRequestRouter(
|
|||||||
't1' : [WBHandler()],
|
't1' : [WBHandler()],
|
||||||
't2' : [query],
|
't2' : [query],
|
||||||
't3' : [query, replay],
|
't3' : [query, replay],
|
||||||
|
'web': FullHandler(query, replay)
|
||||||
},
|
},
|
||||||
hostpaths = ['http://localhost:9090/'])
|
hostpaths = ['http://localhost:9090/'])
|
||||||
## ===========
|
## ===========
|
||||||
@ -42,6 +53,9 @@ def application(env, start_response):
|
|||||||
if not response:
|
if not response:
|
||||||
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
|
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
|
||||||
|
|
||||||
|
except wbexceptions.InternalRedirect as ir:
|
||||||
|
response = WbResponse(status = ir.status, headersList = ir.httpHeaders)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
last_exc = e
|
last_exc = e
|
||||||
import traceback
|
import traceback
|
||||||
|
@ -38,4 +38,15 @@ class InvalidArchiveRecordException(CaptureException):
|
|||||||
super(InvalidArchiveRecordException, self).__init__(msg)
|
super(InvalidArchiveRecordException, self).__init__(msg)
|
||||||
self.errList = errList
|
self.errList = errList
|
||||||
|
|
||||||
|
class ArchiveLoadFailed(CaptureException):
    """Capture error for when no archive record could be loaded at all."""
|
||||||
|
|
||||||
|
class InternalRedirect(Exception):
    """Raised to turn the current request into an HTTP redirect.

    Attributes:
      status      -- the HTTP status line to respond with
      httpHeaders -- header list holding the single Location header
    """

    def __init__(self, location, status = '302 Internal Redirect'):
        """
        location -- url to redirect to
        status   -- status line of the redirect response
        """
        Exception.__init__(self, 'Redirecting -> ' + location)
        # plain attributes: handlers read ir.status / ir.httpHeaders directly.
        # (A former ``status`` method was dropped: it referenced an undefined
        # ``self`` and was hidden by this attribute on every instance anyway.)
        self.status = status
        self.httpHeaders = [('Location', location)]
|
||||||
|
|
||||||
|
@ -22,6 +22,9 @@ class WBHtml(HTMLParser):
|
|||||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
|
|
||||||
|
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||||
|
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||||
|
|
||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||||
|
|
||||||
@ -41,7 +44,18 @@ class WBHtml(HTMLParser):
|
|||||||
# Unterminated style tag auto-terminate
|
# Unterminated style tag auto-terminate
|
||||||
>>> parse('<style>@import url(styles.css)')
|
>>> parse('<style>@import url(styles.css)')
|
||||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||||
"""
|
|
||||||
|
# Head Insertion
|
||||||
|
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', headInsert = '<script src="cool.js"></script>')
|
||||||
|
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||||
|
|
||||||
|
>>> parse('<body><div>SomeTest</div>', headInsert = '/* Insert */')
|
||||||
|
/* Insert */<body><div>SomeTest</div>
|
||||||
|
|
||||||
|
>>> parse('<link href="abc.txt"><div>SomeTest</div>', headInsert = '<script>load_stuff();</script>')
|
||||||
|
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
REWRITE_TAGS = {
|
REWRITE_TAGS = {
|
||||||
'a': {'href': ''},
|
'a': {'href': ''},
|
||||||
@ -53,6 +67,7 @@ class WBHtml(HTMLParser):
|
|||||||
'body': {'background': 'im_'},
|
'body': {'background': 'im_'},
|
||||||
'del': {'cite': ''},
|
'del': {'cite': ''},
|
||||||
'embed': {'src': 'oe_'},
|
'embed': {'src': 'oe_'},
|
||||||
|
'head': {'': ''}, # for head rewriting
|
||||||
'iframe': {'src': 'if_'},
|
'iframe': {'src': 'if_'},
|
||||||
'img': {'src': 'im_'},
|
'img': {'src': 'im_'},
|
||||||
'ins': {'cite': ''},
|
'ins': {'cite': ''},
|
||||||
@ -64,6 +79,7 @@ class WBHtml(HTMLParser):
|
|||||||
'object': {'codebase': 'oe_',
|
'object': {'codebase': 'oe_',
|
||||||
'data': 'oe_'},
|
'data': 'oe_'},
|
||||||
'q': {'cite': ''},
|
'q': {'cite': ''},
|
||||||
|
'ref': {'href': 'oe_'},
|
||||||
'script': {'src': 'js_'},
|
'script': {'src': 'js_'},
|
||||||
'div': {'data-src' : '',
|
'div': {'data-src' : '',
|
||||||
'data-uri' : ''},
|
'data-uri' : ''},
|
||||||
@ -73,17 +89,21 @@ class WBHtml(HTMLParser):
|
|||||||
|
|
||||||
STATE_TAGS = ['script', 'style']
|
STATE_TAGS = ['script', 'style']
|
||||||
|
|
||||||
|
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']
|
||||||
|
|
||||||
def __init__(self, rewriter, outstream = None):
|
|
||||||
|
def __init__(self, rewriter, outstream = None, headInsert = None):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
|
|
||||||
self.rewriter = rewriter
|
self.rewriter = rewriter
|
||||||
self._wbParseContext = None
|
self._wbParseContext = None
|
||||||
self.out = outstream if outstream else sys.stdout
|
self.out = outstream if outstream else sys.stdout
|
||||||
|
|
||||||
self.jsRewriter = JSRewriter(rewriter.getAbsUrl())
|
self.jsRewriter = JSRewriter(rewriter)
|
||||||
self.cssRewriter = CSSRewriter(rewriter)
|
self.cssRewriter = CSSRewriter(rewriter)
|
||||||
|
|
||||||
|
self.headInsert = headInsert
|
||||||
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if (self._wbParseContext):
|
if (self._wbParseContext):
|
||||||
@ -137,6 +157,11 @@ class WBHtml(HTMLParser):
|
|||||||
elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
||||||
self._wbParseContext = tag
|
self._wbParseContext = tag
|
||||||
|
|
||||||
|
# special case: head insertion, non-head tags
|
||||||
|
elif (self.headInsert and (self._wbParseContext == None) and (tag not in WBHtml.HEAD_TAGS)):
|
||||||
|
self.out.write(self.headInsert)
|
||||||
|
self.headInsert = None
|
||||||
|
|
||||||
# attr rewriting
|
# attr rewriting
|
||||||
handler = WBHtml.REWRITE_TAGS.get(tag)
|
handler = WBHtml.REWRITE_TAGS.get(tag)
|
||||||
if not handler:
|
if not handler:
|
||||||
@ -159,8 +184,9 @@ class WBHtml(HTMLParser):
|
|||||||
attrValue = self._rewriteCSS(attrValue)
|
attrValue = self._rewriteCSS(attrValue)
|
||||||
|
|
||||||
# special case: meta tag
|
# special case: meta tag
|
||||||
elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
|
elif (tag == 'meta') and (attrName == 'content'):
|
||||||
attrValue = self._rewriteMetaRefresh(attrValue)
|
if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
|
||||||
|
attrValue = self._rewriteMetaRefresh(attrValue)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
rwMod = handler.get(attrName)
|
rwMod = handler.get(attrName)
|
||||||
@ -171,6 +197,11 @@ class WBHtml(HTMLParser):
|
|||||||
|
|
||||||
self.out.write('/>' if isStartEnd else '>')
|
self.out.write('/>' if isStartEnd else '>')
|
||||||
|
|
||||||
|
# special case: head tag
|
||||||
|
if (self.headInsert) and (self._wbParseContext == None) and (tag == "head"):
|
||||||
|
self.out.write(self.headInsert)
|
||||||
|
self.headInsert = None
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
@ -233,8 +264,8 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
|
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||||
|
|
||||||
def parse(data):
|
def parse(data, headInsert = None):
|
||||||
parser = WBHtml(rewriter)
|
parser = WBHtml(rewriter, headInsert = headInsert)
|
||||||
parser.feed(data)
|
parser.feed(data)
|
||||||
parser.close()
|
parser.close()
|
||||||
|
|
||||||
|
@ -118,7 +118,9 @@ class WbResponse:
|
|||||||
finally:
|
finally:
|
||||||
stream.close()
|
stream.close()
|
||||||
|
|
||||||
return WbResponse(statusline, headersList = headers, value = streamGen())
|
response = WbResponse(statusline, headersList = headers, value = streamGen())
|
||||||
|
response._stream = stream
|
||||||
|
return response
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def better_timestamp_response(wbrequest, newTimestamp):
|
def better_timestamp_response(wbrequest, newTimestamp):
|
||||||
@ -139,7 +141,6 @@ class WbResponse:
|
|||||||
if env['REQUEST_METHOD'] == 'HEAD':
|
if env['REQUEST_METHOD'] == 'HEAD':
|
||||||
if hasattr(self.body, 'close'):
|
if hasattr(self.body, 'close'):
|
||||||
self.body.close()
|
self.body.close()
|
||||||
return self.body
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if hasattr(self.body, '__iter__'):
|
if hasattr(self.body, '__iter__'):
|
||||||
|
@ -38,6 +38,9 @@ class ArchivalUrlRewriter:
|
|||||||
>>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
|
>>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
|
||||||
'/abc/19960708im_/'
|
'/abc/19960708im_/'
|
||||||
|
|
||||||
|
>>> ArchivalUrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').getTimestampUrl('20131024')
|
||||||
|
'/123/20131024id_/http://example.com/file/path/blah.html'
|
||||||
|
|
||||||
>>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
|
>>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
|
||||||
True
|
True
|
||||||
"""
|
"""
|
||||||
@ -46,8 +49,8 @@ class ArchivalUrlRewriter:
|
|||||||
|
|
||||||
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
|
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
|
||||||
|
|
||||||
def __init__(self, wburl_str, prefix):
|
def __init__(self, wburl, prefix):
|
||||||
self.wburl = ArchivalUrl(wburl_str)
|
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
|
||||||
if self.prefix.endswith('/'):
|
if self.prefix.endswith('/'):
|
||||||
@ -84,6 +87,12 @@ class ArchivalUrlRewriter:
|
|||||||
def getAbsUrl(self, url = ''):
|
def getAbsUrl(self, url = ''):
|
||||||
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
|
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
|
||||||
|
|
||||||
|
def getTimestampUrl(self, timestamp, url = None):
|
||||||
|
if not url:
|
||||||
|
url = self.wburl.url
|
||||||
|
|
||||||
|
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url)
|
||||||
|
|
||||||
|
|
||||||
def setBaseUrl(self, newUrl):
|
def setBaseUrl(self, newUrl):
|
||||||
self.wburl.url = newUrl
|
self.wburl.url = newUrl
|
||||||
|
11
run.sh
Executable file
11
run.sh
Executable file
@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
mypath=$(cd `dirname $0` && pwd)
|
||||||
|
|
||||||
|
app=$1
|
||||||
|
cd $mypath/pywb
|
||||||
|
if [ -z "$app" ]; then
|
||||||
|
app=wbapp.py
|
||||||
|
fi
|
||||||
|
|
||||||
|
uwsgi --static-map /static=$mypath/static --http :9090 --wsgi-file $app
|
13
static/wb.css
Normal file
13
static/wb.css
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
|
||||||
|
#_wayback_banner
|
||||||
|
{
|
||||||
|
display: block;
|
||||||
|
position: absolute;
|
||||||
|
top: 0px;
|
||||||
|
width: 100%;
|
||||||
|
border: 1px solid;
|
||||||
|
background-color: lightYellow;
|
||||||
|
text-align: center;
|
||||||
|
z-index: 2147483643;
|
||||||
|
}
|
||||||
|
|
42
static/wb.js
Normal file
42
static/wb.js
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
|
||||||
|
|
||||||
|
// Rewritten location and domain obj setup
|
||||||
|
window.WB_wombat_location = window.location
|
||||||
|
|
||||||
|
if (window.top != window) {
|
||||||
|
window.top.WB_wombat_location = window.top.location
|
||||||
|
}
|
||||||
|
|
||||||
|
if (window.opener) {
|
||||||
|
window.opener.WB_wombat_location = window.opener.location
|
||||||
|
}
|
||||||
|
|
||||||
|
document.WB_wombat_domain = document.domain
|
||||||
|
|
||||||
|
function initBanner()
|
||||||
|
{
|
||||||
|
var BANNER_ID = "_wayback_banner";
|
||||||
|
|
||||||
|
var banner = document.getElementById(BANNER_ID);
|
||||||
|
|
||||||
|
if (!banner) {
|
||||||
|
banner = document.createElement("wb_div");
|
||||||
|
banner.setAttribute("id", BANNER_ID);
|
||||||
|
banner.style.cssText = "display: block; width: 100%; border: 1px solid; background-color: lightYellow; text-align: center";
|
||||||
|
|
||||||
|
//banner.innerHTML = "<img src='http://wbgrp-svc112.us.archive.org:8080/images/logo_WM.png#wb_pass'/>";
|
||||||
|
banner.innerHTML = "PyWb Banner!"
|
||||||
|
document.body.insertBefore(banner, document.body.firstChild);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var readyStateCheckInterval = setInterval(function() {
|
||||||
|
if (document.readyState === "interactive" || document.readyState === "complete") {
|
||||||
|
initBanner();
|
||||||
|
clearInterval(readyStateCheckInterval);
|
||||||
|
}
|
||||||
|
}, 10);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user