diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 51ae4498..365492fd 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -7,6 +7,7 @@ import urllib2 import StringIO import urlparse import collections +import wbexceptions #================================================================= class HttpStreamLoader: @@ -143,9 +144,9 @@ class ArchiveLoader: headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))] # special case: http 0.9 response, no status or headers - elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')): - statusline = '200 OK' - headers = [] + #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')): + # statusline = '200 OK' + # headers = [] # response record: parse HTTP status and headers! else: @@ -202,13 +203,13 @@ class LineReader: self.buff = StringIO.StringIO(data) - def read(self): + def read(self, length = None): self._fillbuff() - return self.buff.read() + return self.buff.read(length) - def readline(self): + def readline(self, length = None): self._fillbuff() - return self.buff.readline() + return self.buff.readline(length) def close(self): if self.stream: diff --git a/pywb/replay.py b/pywb/replay.py index a992a5e4..08ed8763 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -117,7 +117,7 @@ class ReplayHandler(object): if fullUrl: return fullUrl - raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename) + raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename) #================================================================= @@ -125,10 +125,10 @@ class RewritingReplayHandler(ReplayHandler): REWRITE_TYPES = { - 'html': ('text/html', 'application/xhtml'), - 'css': ('text/css'), - 'js': ('text/javascript', 'application/javascript', 'application/x-javascript'), - 'xml': ('/xml', '+xml', '.xml', '.rss'), + 'html': ['text/html', 'application/xhtml'], + 'css': ['text/css'], + 'js': ['text/javascript', 'application/javascript', 'application/x-javascript'], + 'xml': ['/xml', '+xml', '.xml', '.rss'], } @@ -145,11 +145,10 @@ class RewritingReplayHandler(ReplayHandler): self.headInsert = headInsert - def _canonContentType(self, contentType): - for type, mimelist in self.REWRITE_TYPES.iteritems(): - for mime in mimelist: - if mime in contentType: - return type + def _textContentType(self, contentType): + for ctype, mimelist in self.REWRITE_TYPES.iteritems(): + if any ((mime in contentType) for mime in mimelist): + return ctype return None @@ -168,13 +167,13 @@ class RewritingReplayHandler(ReplayHandler): return response contentType = utils.get_header(response.headersList, 'Content-Type') - - canonType = self._canonContentType(contentType) - - (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, (canonType is not None)) + + textType = self._textContentType(contentType) if contentType else None + + (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None) # binary type, just send through - if canonType is None: + if textType is None: response.headersList = newHeaders return response @@ -186,19 +185,19 @@ class RewritingReplayHandler(ReplayHandler): if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))): stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)) - return self._rewriteContent(canonType, urlrewriter, stream, newHeaders, response) + return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response) # TODO: first non-streaming attempt, probably want to stream - def _rewriteContent(self, canonType, urlrewriter, stream, newHeaders, origResponse): - if canonType == 'html': + def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'): + if textType == 'html': out = StringIO.StringIO() htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) try: - buff = stream.read() + buff = stream.read()#.decode(encoding) while buff: htmlrewriter.feed(buff) - buff = stream.read() + buff = stream.read()#.decode(encoding) htmlrewriter.close() @@ -207,12 +206,13 @@ class RewritingReplayHandler(ReplayHandler): finally: value = [out.getvalue()] + newHeaders.append(('Content-Length', str(len(value[0])))) out.close() else: - if canonType == 'css': + if textType == 'css': rewriter = regexmatch.CSSRewriter(urlrewriter) - elif canonType == 'js': + elif textType == 'js': rewriter = regexmatch.JSRewriter(urlrewriter) def gen(): @@ -231,7 +231,7 @@ class RewritingReplayHandler(ReplayHandler): - def _rewriteHeaders(self, headers, stripEncoding = False): + def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False): newHeaders = [] removedHeaders = [] diff --git a/pywb/wbexceptions.py b/pywb/wbexceptions.py index 036f309b..1c61a7d6 100644 --- a/pywb/wbexceptions.py +++ b/pywb/wbexceptions.py @@ -34,7 +34,7 @@ class UnknownLoaderProtocolException(CaptureException): pass class InvalidArchiveRecordException(CaptureException): - def __init__(msg, errList = None): + def __init__(self, msg, errList = None): super(InvalidArchiveRecordException, self).__init__(msg) self.errList = errList diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py index 05e81e40..d6b39550 100644 --- a/pywb/wbhtml.py +++ b/pywb/wbhtml.py @@ -19,6 +19,13 @@ class WBHtml(HTMLParser): >>> parse('