1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

support utf-8 (so far)

support protocol-agnostic prefix //
failedFile list for warc loading
This commit is contained in:
Ilya Kreymer 2013-12-31 00:18:12 +00:00
parent b8c4a453c9
commit d9930322f1
6 changed files with 62 additions and 34 deletions

View File

@ -1,4 +1,5 @@
import StringIO
from urllib2 import URLError
import indexreader
from wbrequestresponse import WbResponse
@ -35,6 +36,9 @@ class ReplayHandler(object):
cdxlist = query_response.body
last_e = None
first = True
# Filenames of w/arc files that have already failed to load during this request
failedFiles = []
for cdx in cdxlist:
try:
@ -45,15 +49,12 @@ class ReplayHandler(object):
self._checkRedir(wbrequest, cdx)
first = False
response = self.doReplay(cdx, wbrequest)
response = self.doReplay(cdx, wbrequest, failedFiles)
if response:
response.cdx = cdx
return response
#except wbexceptions.InternalRedirect as ir:
# raise ir
except wbexceptions.CaptureException as ce:
import traceback
traceback.print_exc()
@ -63,38 +64,51 @@ class ReplayHandler(object):
if last_e:
raise last_e
else:
raise wbexceptions.ArchiveLoadFailed()
raise wbexceptions.UnresolvedArchiveFileException()
# Base implementation: performs no redirect check and always returns None.
def _checkRedir(self, wbrequest, cdx):
return None
def _load(self, cdx, revisit=False, failedFiles=None):
    """Load the archive record referenced by a cdx entry.

    cdx         -- mapping holding 'filename'/'offset'/'length' and the
                   matching 'orig.*' keys.
    revisit     -- when true, read the 'orig.*' keys instead of the plain
                   ones.
    failedFiles -- optional list of filenames that already failed to load
                   during this request; it is appended to in place when a
                   load fails. Pass None to disable the tracking.

    Returns whatever self.archiveloader.load() returns.

    Raises wbexceptions.ArchiveLoadFailed when the file is already listed
    in failedFiles, or when the loader raises URLError.
    """
    if revisit:
        filename, offset, length = (cdx['orig.filename'],
                                    cdx['orig.offset'],
                                    cdx['orig.length'])
    else:
        filename, offset, length = (cdx['filename'],
                                    cdx['offset'],
                                    cdx['length'])

    # Optimization: if the same file already failed during this request,
    # don't try it again.
    if failedFiles is not None and filename in failedFiles:
        raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

    try:
        return self.archiveloader.load(self.resolveFull(filename), offset, length)
    except URLError as ue:
        # Test against None rather than truthiness: the caller starts with
        # an empty list, which is falsy, so a plain `if failedFiles:` would
        # never record the first failure.
        if failedFiles is not None:
            failedFiles.append(filename)
        raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
def doReplay(self, cdx, wbrequest):
def doReplay(self, cdx, wbrequest, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-')
# Case 1: non-revisit
if (hasCurr and not hasOrig):
headersRecord = self._load(cdx, False)
headersRecord = self._load(cdx, False, failedFiles)
payloadRecord = headersRecord
isRevisit = False
# Case 2: old-style revisit, load headers from original payload
elif (not hasCurr and hasOrig):
payloadRecord = self._load(cdx, False)
payloadRecord = self._load(cdx, False, failedFiles)
headersRecord = payloadRecord
isRevisit = True
# Case 3: modern revisit, load headers from curr, payload from original
elif (hasCurr and hasOrig):
headersRecord = self._load(cdx, False)
payloadRecord = self._load(cdx, True)
headersRecord = self._load(cdx, False, failedFiles)
payloadRecord = self._load(cdx, True, failedFiles)
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headersRecord.httpHeaders:
@ -191,13 +205,15 @@ class RewritingReplayHandler(ReplayHandler):
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
if textType == 'html':
out = StringIO.StringIO()
#out = SimpleWriter()
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
try:
buff = stream.read()#.decode(encoding)
buff = stream.read()
while buff:
buff = buff.decode(encoding)
htmlrewriter.feed(buff)
buff = stream.read()#.decode(encoding)
buff = stream.read()
htmlrewriter.close()
@ -205,29 +221,23 @@ class RewritingReplayHandler(ReplayHandler):
# print e
finally:
value = [out.getvalue()]
value = [out.getvalue().encode(encoding)]
newHeaders.append(('Content-Length', str(len(value[0]))))
out.close()
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
else:
if textType == 'css':
rewriter = regexmatch.CSSRewriter(urlrewriter)
elif textType == 'js':
rewriter = regexmatch.JSRewriter(urlrewriter)
def gen():
try:
buff = stream.read()
while buff:
yield rewriter.replaceAll(buff)
buff = stream.read()
def doRewrite(buff):
return rewriter.replaceAll(buff)
finally:
stream.close()
return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
value = gen()
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
@ -261,8 +271,8 @@ class RewritingReplayHandler(ReplayHandler):
return None
def doReplay(self, cdx, wbrequest):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest)
def doReplay(self, cdx, wbrequest, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
# Check for self redirect
if wbresponse.status.startswith('3'):
@ -286,3 +296,5 @@ def PrefixResolver(prefix, contains):
return prefix + url if (contains in url) else None
return makeUrl

View File

@ -17,6 +17,10 @@ class ArchivalUrl:
>>> repr(ArchivalUrl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
# Protocol agnostic convert to http
>>> repr(ArchivalUrl('/20130102im_///example.com'))
"('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
>>> repr(ArchivalUrl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
@ -81,7 +85,11 @@ class ArchivalUrl:
if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if not self.url.startswith('//') and not '://' in self.url:
# protocol agnostic url -> http://
if self.url.startswith('//'):
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
# no protocol -> http://
elif not '://' in self.url:
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
matcher = rfc3987.match(self.url, 'IRI')

View File

@ -39,7 +39,13 @@ class InvalidArchiveRecordException(CaptureException):
self.errList = errList
class ArchiveLoadFailed(CaptureException):
    """Raised when an archive file could not be loaded.

    The exception message is '<filename>:<reason>'; both parts are also
    kept as the `filename` and `reason` attributes.
    """

    def __init__(self, filename, reason):
        message = filename + ':' + str(reason)
        super(ArchiveLoadFailed, self).__init__(message)
        self.filename = filename
        self.reason = reason

    def status(self):
        # Status string reported for this failure.
        return '503'
class InternalRedirect(Exception):
def __init__(self, location, status = '302 Internal Redirect'):

View File

@ -108,11 +108,13 @@ class WbResponse:
return WbResponse(status, headersList = [('Location', location)])
@staticmethod
def stream_response(statusline, headers, stream):
def stream_response(statusline, headers, stream, proc = None):
def streamGen():
try:
buff = stream.read()
while buff:
if proc:
buff = proc(buff)
yield buff
buff = stream.read()
finally:

2
run.sh
View File

@ -8,4 +8,4 @@ if [ -z "$app" ]; then
app=wbapp.py
fi
uwsgi --static-map /static=$mypath/static --http :9090 --wsgi-file $app
uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app

View File

@ -2,7 +2,7 @@
#_wayback_banner
{
display: block;
position: absolute;
position: fixed;
top: 0px;
width: 100%;
border: 1px solid;