mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
support utf-8 (so far)
support protocol-agnostic prefix // failedFile list for warc loading
This commit is contained in:
parent
b8c4a453c9
commit
d9930322f1
@ -1,4 +1,5 @@
|
|||||||
import StringIO
|
import StringIO
|
||||||
|
from urllib2 import URLError
|
||||||
|
|
||||||
import indexreader
|
import indexreader
|
||||||
from wbrequestresponse import WbResponse
|
from wbrequestresponse import WbResponse
|
||||||
@ -35,6 +36,9 @@ class ReplayHandler(object):
|
|||||||
cdxlist = query_response.body
|
cdxlist = query_response.body
|
||||||
last_e = None
|
last_e = None
|
||||||
first = True
|
first = True
|
||||||
|
|
||||||
|
# List of already failed w/arcs
|
||||||
|
failedFiles = []
|
||||||
|
|
||||||
for cdx in cdxlist:
|
for cdx in cdxlist:
|
||||||
try:
|
try:
|
||||||
@ -45,15 +49,12 @@ class ReplayHandler(object):
|
|||||||
self._checkRedir(wbrequest, cdx)
|
self._checkRedir(wbrequest, cdx)
|
||||||
first = False
|
first = False
|
||||||
|
|
||||||
response = self.doReplay(cdx, wbrequest)
|
response = self.doReplay(cdx, wbrequest, failedFiles)
|
||||||
|
|
||||||
if response:
|
if response:
|
||||||
response.cdx = cdx
|
response.cdx = cdx
|
||||||
return response
|
return response
|
||||||
|
|
||||||
#except wbexceptions.InternalRedirect as ir:
|
|
||||||
# raise ir
|
|
||||||
|
|
||||||
except wbexceptions.CaptureException as ce:
|
except wbexceptions.CaptureException as ce:
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
@ -63,38 +64,51 @@ class ReplayHandler(object):
|
|||||||
if last_e:
|
if last_e:
|
||||||
raise last_e
|
raise last_e
|
||||||
else:
|
else:
|
||||||
raise wbexceptions.ArchiveLoadFailed()
|
raise wbexceptions.UnresolvedArchiveFileException()
|
||||||
|
|
||||||
def _checkRedir(self, wbrequest, cdx):
|
def _checkRedir(self, wbrequest, cdx):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _load(self, cdx, revisit = False):
|
def _load(self, cdx, revisit, failedFiles):
|
||||||
if revisit:
|
if revisit:
|
||||||
return self.archiveloader.load(self.resolveFull(cdx['orig.filename']), cdx['orig.offset'], cdx['orig.length'])
|
(filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
|
||||||
else:
|
else:
|
||||||
return self.archiveloader.load(self.resolveFull(cdx['filename']), cdx['offset'], cdx['length'])
|
(filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])
|
||||||
|
|
||||||
|
#optimization: if same file already failed this request, don't try again
|
||||||
|
if failedFiles and filename in failedFiles:
|
||||||
|
raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')
|
||||||
|
|
||||||
|
try:
|
||||||
|
return self.archiveloader.load(self.resolveFull(filename), offset, length)
|
||||||
|
|
||||||
|
except URLError as ue:
|
||||||
|
if failedFiles:
|
||||||
|
failedFiles.append(filename)
|
||||||
|
|
||||||
|
raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest):
|
def doReplay(self, cdx, wbrequest, failedFiles):
|
||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
hasOrig = (cdx['orig.filename'] != '-')
|
hasOrig = (cdx['orig.filename'] != '-')
|
||||||
|
|
||||||
# Case 1: non-revisit
|
# Case 1: non-revisit
|
||||||
if (hasCurr and not hasOrig):
|
if (hasCurr and not hasOrig):
|
||||||
headersRecord = self._load(cdx, False)
|
headersRecord = self._load(cdx, False, failedFiles)
|
||||||
payloadRecord = headersRecord
|
payloadRecord = headersRecord
|
||||||
isRevisit = False
|
isRevisit = False
|
||||||
|
|
||||||
# Case 2: old-style revisit, load headers from original payload
|
# Case 2: old-style revisit, load headers from original payload
|
||||||
elif (not hasCurr and hasOrig):
|
elif (not hasCurr and hasOrig):
|
||||||
payloadRecord = self._load(cdx, False)
|
payloadRecord = self._load(cdx, False, failedFiles)
|
||||||
headersRecord = payloadRecord
|
headersRecord = payloadRecord
|
||||||
isRevisit = True
|
isRevisit = True
|
||||||
|
|
||||||
# Case 3: modern revisit, load headers from curr, payload from original
|
# Case 3: modern revisit, load headers from curr, payload from original
|
||||||
elif (hasCurr and hasOrig):
|
elif (hasCurr and hasOrig):
|
||||||
headersRecord = self._load(cdx, False)
|
headersRecord = self._load(cdx, False, failedFiles)
|
||||||
payloadRecord = self._load(cdx, True)
|
payloadRecord = self._load(cdx, True, failedFiles)
|
||||||
|
|
||||||
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
|
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
|
||||||
if not headersRecord.httpHeaders:
|
if not headersRecord.httpHeaders:
|
||||||
@ -191,13 +205,15 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
|
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
|
||||||
if textType == 'html':
|
if textType == 'html':
|
||||||
out = StringIO.StringIO()
|
out = StringIO.StringIO()
|
||||||
|
#out = SimpleWriter()
|
||||||
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
|
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
buff = stream.read()#.decode(encoding)
|
buff = stream.read()
|
||||||
while buff:
|
while buff:
|
||||||
|
buff = buff.decode(encoding)
|
||||||
htmlrewriter.feed(buff)
|
htmlrewriter.feed(buff)
|
||||||
buff = stream.read()#.decode(encoding)
|
buff = stream.read()
|
||||||
|
|
||||||
htmlrewriter.close()
|
htmlrewriter.close()
|
||||||
|
|
||||||
@ -205,29 +221,23 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
# print e
|
# print e
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
value = [out.getvalue()]
|
value = [out.getvalue().encode(encoding)]
|
||||||
newHeaders.append(('Content-Length', str(len(value[0]))))
|
newHeaders.append(('Content-Length', str(len(value[0]))))
|
||||||
out.close()
|
out.close()
|
||||||
|
|
||||||
|
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if textType == 'css':
|
if textType == 'css':
|
||||||
rewriter = regexmatch.CSSRewriter(urlrewriter)
|
rewriter = regexmatch.CSSRewriter(urlrewriter)
|
||||||
elif textType == 'js':
|
elif textType == 'js':
|
||||||
rewriter = regexmatch.JSRewriter(urlrewriter)
|
rewriter = regexmatch.JSRewriter(urlrewriter)
|
||||||
|
|
||||||
def gen():
|
def doRewrite(buff):
|
||||||
try:
|
return rewriter.replaceAll(buff)
|
||||||
buff = stream.read()
|
|
||||||
while buff:
|
|
||||||
yield rewriter.replaceAll(buff)
|
|
||||||
buff = stream.read()
|
|
||||||
|
|
||||||
finally:
|
return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
|
||||||
stream.close()
|
|
||||||
|
|
||||||
value = gen()
|
|
||||||
|
|
||||||
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -261,8 +271,8 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def doReplay(self, cdx, wbrequest):
|
def doReplay(self, cdx, wbrequest, failedFiles):
|
||||||
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest)
|
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
|
||||||
|
|
||||||
# Check for self redirect
|
# Check for self redirect
|
||||||
if wbresponse.status.startswith('3'):
|
if wbresponse.status.startswith('3'):
|
||||||
@ -286,3 +296,5 @@ def PrefixResolver(prefix, contains):
|
|||||||
return prefix + url if (contains in url) else None
|
return prefix + url if (contains in url) else None
|
||||||
|
|
||||||
return makeUrl
|
return makeUrl
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,6 +17,10 @@ class ArchivalUrl:
|
|||||||
>>> repr(ArchivalUrl('/20130102im_/https://example.com'))
|
>>> repr(ArchivalUrl('/20130102im_/https://example.com'))
|
||||||
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
|
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
|
||||||
|
|
||||||
|
# Protocol agnostic convert to http
|
||||||
|
>>> repr(ArchivalUrl('/20130102im_///example.com'))
|
||||||
|
"('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
|
||||||
|
|
||||||
>>> repr(ArchivalUrl('/cs_/example.com'))
|
>>> repr(ArchivalUrl('/cs_/example.com'))
|
||||||
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
|
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
|
||||||
|
|
||||||
@ -81,7 +85,11 @@ class ArchivalUrl:
|
|||||||
if len(self.url) == 0:
|
if len(self.url) == 0:
|
||||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
if not self.url.startswith('//') and not '://' in self.url:
|
# protocol agnostic url -> http://
|
||||||
|
if self.url.startswith('//'):
|
||||||
|
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
|
||||||
|
# no protocol -> http://
|
||||||
|
elif not '://' in self.url:
|
||||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
|
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
|
||||||
|
|
||||||
matcher = rfc3987.match(self.url, 'IRI')
|
matcher = rfc3987.match(self.url, 'IRI')
|
||||||
|
@ -39,7 +39,13 @@ class InvalidArchiveRecordException(CaptureException):
|
|||||||
self.errList = errList
|
self.errList = errList
|
||||||
|
|
||||||
class ArchiveLoadFailed(CaptureException):
|
class ArchiveLoadFailed(CaptureException):
|
||||||
pass
|
def __init__(self, filename, reason):
|
||||||
|
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
|
||||||
|
self.filename = filename
|
||||||
|
self.reason = reason
|
||||||
|
|
||||||
|
def status(_):
|
||||||
|
return '503'
|
||||||
|
|
||||||
class InternalRedirect(Exception):
|
class InternalRedirect(Exception):
|
||||||
def __init__(self, location, status = '302 Internal Redirect'):
|
def __init__(self, location, status = '302 Internal Redirect'):
|
||||||
|
@ -108,11 +108,13 @@ class WbResponse:
|
|||||||
return WbResponse(status, headersList = [('Location', location)])
|
return WbResponse(status, headersList = [('Location', location)])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def stream_response(statusline, headers, stream):
|
def stream_response(statusline, headers, stream, proc = None):
|
||||||
def streamGen():
|
def streamGen():
|
||||||
try:
|
try:
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
while buff:
|
while buff:
|
||||||
|
if proc:
|
||||||
|
buff = proc(buff)
|
||||||
yield buff
|
yield buff
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
finally:
|
finally:
|
||||||
|
2
run.sh
2
run.sh
@ -8,4 +8,4 @@ if [ -z "$app" ]; then
|
|||||||
app=wbapp.py
|
app=wbapp.py
|
||||||
fi
|
fi
|
||||||
|
|
||||||
uwsgi --static-map /static=$mypath/static --http :9090 --wsgi-file $app
|
uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
#_wayback_banner
|
#_wayback_banner
|
||||||
{
|
{
|
||||||
display: block;
|
display: block;
|
||||||
position: absolute;
|
position: fixed;
|
||||||
top: 0px;
|
top: 0px;
|
||||||
width: 100%;
|
width: 100%;
|
||||||
border: 1px solid;
|
border: 1px solid;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user