1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

support utf-8 (so far)

support protocol-agnostic prefix //
failedFile list for warc loading
This commit is contained in:
Ilya Kreymer 2013-12-31 00:18:12 +00:00
parent b8c4a453c9
commit d9930322f1
6 changed files with 62 additions and 34 deletions

View File

@ -1,4 +1,5 @@
import StringIO import StringIO
from urllib2 import URLError
import indexreader import indexreader
from wbrequestresponse import WbResponse from wbrequestresponse import WbResponse
@ -35,6 +36,9 @@ class ReplayHandler(object):
cdxlist = query_response.body cdxlist = query_response.body
last_e = None last_e = None
first = True first = True
# List of already failed w/arcs
failedFiles = []
for cdx in cdxlist: for cdx in cdxlist:
try: try:
@ -45,15 +49,12 @@ class ReplayHandler(object):
self._checkRedir(wbrequest, cdx) self._checkRedir(wbrequest, cdx)
first = False first = False
response = self.doReplay(cdx, wbrequest) response = self.doReplay(cdx, wbrequest, failedFiles)
if response: if response:
response.cdx = cdx response.cdx = cdx
return response return response
#except wbexceptions.InternalRedirect as ir:
# raise ir
except wbexceptions.CaptureException as ce: except wbexceptions.CaptureException as ce:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@ -63,38 +64,51 @@ class ReplayHandler(object):
if last_e: if last_e:
raise last_e raise last_e
else: else:
raise wbexceptions.ArchiveLoadFailed() raise wbexceptions.UnresolvedArchiveFileException()
def _checkRedir(self, wbrequest, cdx): def _checkRedir(self, wbrequest, cdx):
return None return None
def _load(self, cdx, revisit = False, failedFiles = None):
    """
    Load the archive record referenced by a cdx entry.

    cdx         -- mapping providing 'filename', 'offset' and 'length'
                   (and the 'orig.'-prefixed equivalents used for revisits)
    revisit     -- if true, load the record described by the 'orig.*'
                   fields instead of the current ones
    failedFiles -- optional list of filenames that already failed to load
                   during this request; a newly failing filename is
                   appended to it. Pass None to disable the tracking.

    Returns whatever self.archiveloader.load() returns.

    Raises wbexceptions.ArchiveLoadFailed if the file is already listed in
    failedFiles, or if loading it raises URLError.
    """
    if revisit:
        (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length'])
    else:
        (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length'])

    # optimization: if same file already failed this request, don't try again
    if failedFiles is not None and filename in failedFiles:
        raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed')

    try:
        return self.archiveloader.load(self.resolveFull(filename), offset, length)
    except URLError as ue:
        # Test against None rather than truthiness: the caller starts each
        # request with an empty list, which is falsy, so a truthiness check
        # would never record the first failure and the skip above would
        # never trigger.
        if failedFiles is not None:
            failedFiles.append(filename)

        raise wbexceptions.ArchiveLoadFailed(filename, ue.reason)
def doReplay(self, cdx, wbrequest): def doReplay(self, cdx, wbrequest, failedFiles):
hasCurr = (cdx['filename'] != '-') hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-') hasOrig = (cdx['orig.filename'] != '-')
# Case 1: non-revisit # Case 1: non-revisit
if (hasCurr and not hasOrig): if (hasCurr and not hasOrig):
headersRecord = self._load(cdx, False) headersRecord = self._load(cdx, False, failedFiles)
payloadRecord = headersRecord payloadRecord = headersRecord
isRevisit = False isRevisit = False
# Case 2: old-style revisit, load headers from original payload # Case 2: old-style revisit, load headers from original payload
elif (not hasCurr and hasOrig): elif (not hasCurr and hasOrig):
payloadRecord = self._load(cdx, False) payloadRecord = self._load(cdx, False, failedFiles)
headersRecord = payloadRecord headersRecord = payloadRecord
isRevisit = True isRevisit = True
# Case 3: modern revisit, load headers from curr, payload from original # Case 3: modern revisit, load headers from curr, payload from original
elif (hasCurr and hasOrig): elif (hasCurr and hasOrig):
headersRecord = self._load(cdx, False) headersRecord = self._load(cdx, False, failedFiles)
payloadRecord = self._load(cdx, True) payloadRecord = self._load(cdx, True, failedFiles)
# Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
if not headersRecord.httpHeaders: if not headersRecord.httpHeaders:
@ -191,13 +205,15 @@ class RewritingReplayHandler(ReplayHandler):
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'): def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
if textType == 'html': if textType == 'html':
out = StringIO.StringIO() out = StringIO.StringIO()
#out = SimpleWriter()
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
try: try:
buff = stream.read()#.decode(encoding) buff = stream.read()
while buff: while buff:
buff = buff.decode(encoding)
htmlrewriter.feed(buff) htmlrewriter.feed(buff)
buff = stream.read()#.decode(encoding) buff = stream.read()
htmlrewriter.close() htmlrewriter.close()
@ -205,29 +221,23 @@ class RewritingReplayHandler(ReplayHandler):
# print e # print e
finally: finally:
value = [out.getvalue()] value = [out.getvalue().encode(encoding)]
newHeaders.append(('Content-Length', str(len(value[0])))) newHeaders.append(('Content-Length', str(len(value[0]))))
out.close() out.close()
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
else: else:
if textType == 'css': if textType == 'css':
rewriter = regexmatch.CSSRewriter(urlrewriter) rewriter = regexmatch.CSSRewriter(urlrewriter)
elif textType == 'js': elif textType == 'js':
rewriter = regexmatch.JSRewriter(urlrewriter) rewriter = regexmatch.JSRewriter(urlrewriter)
def gen(): def doRewrite(buff):
try: return rewriter.replaceAll(buff)
buff = stream.read()
while buff:
yield rewriter.replaceAll(buff)
buff = stream.read()
finally: return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite)
stream.close()
value = gen()
return WbResponse(status = origResponse.status, headersList = newHeaders, value = value)
@ -261,8 +271,8 @@ class RewritingReplayHandler(ReplayHandler):
return None return None
def doReplay(self, cdx, wbrequest): def doReplay(self, cdx, wbrequest, failedFiles):
wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest) wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles)
# Check for self redirect # Check for self redirect
if wbresponse.status.startswith('3'): if wbresponse.status.startswith('3'):
@ -286,3 +296,5 @@ def PrefixResolver(prefix, contains):
return prefix + url if (contains in url) else None return prefix + url if (contains in url) else None
return makeUrl return makeUrl

View File

@ -17,6 +17,10 @@ class ArchivalUrl:
>>> repr(ArchivalUrl('/20130102im_/https://example.com')) >>> repr(ArchivalUrl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')" "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
# Protocol agnostic convert to http
>>> repr(ArchivalUrl('/20130102im_///example.com'))
"('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
>>> repr(ArchivalUrl('/cs_/example.com')) >>> repr(ArchivalUrl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')" "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
@ -81,7 +85,11 @@ class ArchivalUrl:
if len(self.url) == 0: if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if not self.url.startswith('//') and not '://' in self.url: # protocol agnostic url -> http://
if self.url.startswith('//'):
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
# no protocol -> http://
elif not '://' in self.url:
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
matcher = rfc3987.match(self.url, 'IRI') matcher = rfc3987.match(self.url, 'IRI')

View File

@ -39,7 +39,13 @@ class InvalidArchiveRecordException(CaptureException):
self.errList = errList self.errList = errList
class ArchiveLoadFailed(CaptureException):
    """
    Capture error for an archive file that could not be loaded.

    Keeps the offending filename and the failure reason as attributes;
    the exception message is '<filename>:<reason>'.
    """

    def __init__(self, filename, reason):
        message = filename + ':' + str(reason)
        super(ArchiveLoadFailed, self).__init__(message)
        self.reason = reason
        self.filename = filename

    def status(self):
        # Status string reported for this error.
        return '503'
class InternalRedirect(Exception): class InternalRedirect(Exception):
def __init__(self, location, status = '302 Internal Redirect'): def __init__(self, location, status = '302 Internal Redirect'):

View File

@ -108,11 +108,13 @@ class WbResponse:
return WbResponse(status, headersList = [('Location', location)]) return WbResponse(status, headersList = [('Location', location)])
@staticmethod @staticmethod
def stream_response(statusline, headers, stream): def stream_response(statusline, headers, stream, proc = None):
def streamGen(): def streamGen():
try: try:
buff = stream.read() buff = stream.read()
while buff: while buff:
if proc:
buff = proc(buff)
yield buff yield buff
buff = stream.read() buff = stream.read()
finally: finally:

2
run.sh
View File

@ -8,4 +8,4 @@ if [ -z "$app" ]; then
app=wbapp.py app=wbapp.py
fi fi
uwsgi --static-map /static=$mypath/static --http :9090 --wsgi-file $app uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app

View File

@ -2,7 +2,7 @@
#_wayback_banner #_wayback_banner
{ {
display: block; display: block;
position: absolute; position: fixed;
top: 0px; top: 0px;
width: 100%; width: 100%;
border: 1px solid; border: 1px solid;