diff --git a/pywb/replay.py b/pywb/replay.py index 08ed8763..fe49cce8 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -1,4 +1,5 @@ import StringIO +from urllib2 import URLError import indexreader from wbrequestresponse import WbResponse @@ -35,6 +36,9 @@ class ReplayHandler(object): cdxlist = query_response.body last_e = None first = True + + # List of already failed w/arcs + failedFiles = [] for cdx in cdxlist: try: @@ -45,15 +49,12 @@ class ReplayHandler(object): self._checkRedir(wbrequest, cdx) first = False - response = self.doReplay(cdx, wbrequest) + response = self.doReplay(cdx, wbrequest, failedFiles) if response: response.cdx = cdx return response - #except wbexceptions.InternalRedirect as ir: - # raise ir - except wbexceptions.CaptureException as ce: import traceback traceback.print_exc() @@ -63,38 +64,51 @@ class ReplayHandler(object): if last_e: raise last_e else: - raise wbexceptions.ArchiveLoadFailed() + raise wbexceptions.UnresolvedArchiveFileException() def _checkRedir(self, wbrequest, cdx): return None - def _load(self, cdx, revisit = False): + def _load(self, cdx, revisit, failedFiles): if revisit: - return self.archiveloader.load(self.resolveFull(cdx['orig.filename']), cdx['orig.offset'], cdx['orig.length']) + (filename, offset, length) = (cdx['orig.filename'], cdx['orig.offset'], cdx['orig.length']) else: - return self.archiveloader.load(self.resolveFull(cdx['filename']), cdx['offset'], cdx['length']) + (filename, offset, length) = (cdx['filename'], cdx['offset'], cdx['length']) + + #optimization: if same file already failed this request, don't try again + if failedFiles and filename in failedFiles: + raise wbexceptions.ArchiveLoadFailed(filename, 'Skipping Already Failed') + + try: + return self.archiveloader.load(self.resolveFull(filename), offset, length) + + except URLError as ue: + if failedFiles: + failedFiles.append(filename) + + raise wbexceptions.ArchiveLoadFailed(filename, ue.reason) - def doReplay(self, cdx, wbrequest): + def doReplay(self, cdx, wbrequest, failedFiles): hasCurr = (cdx['filename'] != '-') hasOrig = (cdx['orig.filename'] != '-') # Case 1: non-revisit if (hasCurr and not hasOrig): - headersRecord = self._load(cdx, False) + headersRecord = self._load(cdx, False, failedFiles) payloadRecord = headersRecord isRevisit = False # Case 2: old-style revisit, load headers from original payload elif (not hasCurr and hasOrig): - payloadRecord = self._load(cdx, False) + payloadRecord = self._load(cdx, False, failedFiles) headersRecord = payloadRecord isRevisit = True # Case 3: modern revisit, load headers from curr, payload from original elif (hasCurr and hasOrig): - headersRecord = self._load(cdx, False) - payloadRecord = self._load(cdx, True) + headersRecord = self._load(cdx, False, failedFiles) + payloadRecord = self._load(cdx, True, failedFiles) # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit if not headersRecord.httpHeaders: @@ -191,13 +205,15 @@ class RewritingReplayHandler(ReplayHandler): def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'): if textType == 'html': out = StringIO.StringIO() + #out = SimpleWriter() htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) try: - buff = stream.read()#.decode(encoding) + buff = stream.read() while buff: + buff = buff.decode(encoding) htmlrewriter.feed(buff) - buff = stream.read()#.decode(encoding) + buff = stream.read() htmlrewriter.close() @@ -205,29 +221,23 @@ class RewritingReplayHandler(ReplayHandler): # print e finally: - value = [out.getvalue()] + value = [out.getvalue().encode(encoding)] newHeaders.append(('Content-Length', str(len(value[0])))) out.close() + return WbResponse(status = origResponse.status, headersList = newHeaders, value = value) + else: if textType == 'css': rewriter = regexmatch.CSSRewriter(urlrewriter) elif textType == 'js': rewriter = regexmatch.JSRewriter(urlrewriter) - def gen(): - try: - buff = stream.read() - while buff: - yield rewriter.replaceAll(buff) - buff = stream.read() + def doRewrite(buff): + return rewriter.replaceAll(buff) - finally: - stream.close() + return WbResponse.stream_response(origResponse.status, newHeaders, stream, doRewrite) - value = gen() - - return WbResponse(status = origResponse.status, headersList = newHeaders, value = value) @@ -261,8 +271,8 @@ class RewritingReplayHandler(ReplayHandler): return None - def doReplay(self, cdx, wbrequest): - wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest) + def doReplay(self, cdx, wbrequest, failedFiles): + wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest, failedFiles) # Check for self redirect if wbresponse.status.startswith('3'): @@ -286,3 +296,5 @@ def PrefixResolver(prefix, contains): return prefix + url if (contains in url) else None return makeUrl + + diff --git a/pywb/wbarchivalurl.py b/pywb/wbarchivalurl.py index 89345f02..ecc07f13 100644 --- a/pywb/wbarchivalurl.py +++ b/pywb/wbarchivalurl.py @@ -17,6 +17,10 @@ class ArchivalUrl: >>> repr(ArchivalUrl('/20130102im_/https://example.com')) "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')" + # Protocol agnostic convert to http + >>> repr(ArchivalUrl('/20130102im_///example.com')) + "('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')" + >>> repr(ArchivalUrl('/cs_/example.com')) "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')" @@ -81,7 +85,11 @@ class ArchivalUrl: if len(self.url) == 0: raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) - if not self.url.startswith('//') and not '://' in self.url: + # protocol agnostic url -> http:// + if self.url.startswith('//'): + self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:] + # no protocol -> http:// + elif not '://' in self.url: self.url = ArchivalUrl.DEFAULT_SCHEME + self.url matcher = rfc3987.match(self.url, 'IRI') diff --git a/pywb/wbexceptions.py b/pywb/wbexceptions.py index 1c61a7d6..22975422 100644 --- a/pywb/wbexceptions.py +++ b/pywb/wbexceptions.py @@ -39,7 +39,13 @@ class InvalidArchiveRecordException(CaptureException): self.errList = errList class ArchiveLoadFailed(CaptureException): - pass + def __init__(self, filename, reason): + super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) + self.filename = filename + self.reason = reason + + def status(_): + return '503' class InternalRedirect(Exception): def __init__(self, location, status = '302 Internal Redirect'): diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 402a6b13..0e35eb36 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -108,11 +108,13 @@ class WbResponse: return WbResponse(status, headersList = [('Location', location)]) @staticmethod - def stream_response(statusline, headers, stream): + def stream_response(statusline, headers, stream, proc = None): def streamGen(): try: buff = stream.read() while buff: + if proc: + buff = proc(buff) yield buff buff = stream.read() finally: diff --git a/run.sh b/run.sh index e0ee2f35..40d6facf 100755 --- a/run.sh +++ b/run.sh @@ -8,4 +8,4 @@ if [ -z "$app" ]; then app=wbapp.py fi -uwsgi --static-map /static=$mypath/static --http :9090 --wsgi-file $app +uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app diff --git a/static/wb.css b/static/wb.css index 33e55701..7ae92277 100644 --- a/static/wb.css +++ b/static/wb.css @@ -2,7 +2,7 @@ #_wayback_banner { display: block; - position: absolute; + position: fixed; top: 0px; width: 100%; border: 1px solid;