diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 9b1fe739..51ae4498 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -15,8 +15,8 @@ class HttpStreamLoader: self.hmacDuration = hmacDuration def load(self, url, offset, length): - if length: - rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1) + if length > 0: + rangeHeader = 'bytes={0}-{1}'.format(offset, offset + length - 1) else: rangeHeader = 'bytes={0}-'.format(offset) @@ -31,7 +31,20 @@ class HttpStreamLoader: #================================================================= -WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders') +# Untested, but for completeness +class FileStreamLoader: + def load(self, url, offset, length): + if url.startswith('file://'): + url = url[len('file://'):] + + afile = open(url, 'rb') + afile.seek(offset) + return afile + + + +#================================================================= +WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'type, record, stream, statusline, httpHeaders') #================================================================= class ArchiveLoader: @@ -46,14 +59,17 @@ class ArchiveLoader: '.arc': (hanzo.warctools.ArcRecord, 'arc', False), } - HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ ((\d+).*)$') + HTTP_STATUS_REGEX = re.compile('^HTTP/[\d.]+ (\d+.*)$') @staticmethod def createDefaultLoaders(): http = HttpStreamLoader() + file = FileStreamLoader() return { 'http': http, 'https': http, + 'file': file, + '': file } @@ -86,10 +102,15 @@ class ArchiveLoader: else: decomp = None + try: + length = int(length) + except: + length = -1 - raw = loader.load(url, offset, length) - reader = LineReader(raw, self.chunkSize, decomp) + raw = loader.load(url, long(offset), length) + + reader = LineReader(raw, length, self.chunkSize, decomp) parser = loaderCls.make_parser() @@ -104,27 +125,33 @@ class ArchiveLoader: if aFormat == 'arc': - recType = 'arc-response' + recType = 'response' empty = (utils.get_header(parsed.headers, 'length') == 0) else: recType = utils.get_header(parsed.headers, 'WARC-Type') empty = (utils.get_header(parsed.headers, 'Content-Length') == '0') - parsed.recType = recType - parsed.aFormat = aFormat - + # special case: empty w/arc record (hopefully a revisit) if empty: - return WBArchiveRecord(parsed, reader, '400', []) + statusline = '204 No Content' + headers = [] + # special case: warc records that are not expected to have http headers + # attempt to add 200 status and content-type elif recType == 'metadata' or recType == 'resource': + statusline = '200 OK' headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))] - return WBArchiveRecord(parsed, reader, '200 OK', headers) + # special case: http 0.9 response, no status or headers + elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')): + statusline = '200 OK' + headers = [] + # response record: parse HTTP status and headers! else: (statusline, headers) = self.parseHttpHeaders(reader) - return WBArchiveRecord(parsed, reader, statusline, headers) + return WBArchiveRecord((aFormat, recType), parsed, reader, statusline, headers) def parseHttpHeaders(self, stream): @@ -153,20 +180,23 @@ class ArchiveLoader: #================================================================= class LineReader: - def __init__(self, stream, chunkSize = 1024, decomp = None): + def __init__(self, stream, maxLen = 0, chunkSize = 1024, decomp = None): self.stream = stream self.chunkSize = chunkSize self.decomp = decomp self.buff = None - self.numread = 0 + self.numRead = 0 + self.maxLen = maxLen def _fillbuff(self, chunkSize = None): if not chunkSize: chunkSize = self.chunkSize if not self.buff or self.buff.pos >= self.buff.len: - data = self.stream.read(chunkSize) - self.numread += len(data) + toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize + data = self.stream.read(toRead) + self.numRead += len(data) + if self.decomp: data = self.decomp.decompress(data) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 8cc80395..a757788a 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -80,6 +80,9 @@ class RemoteCDXServer: ArchivalUrl.REPLAY: {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True}, + # BUG: resolveRevisits currently doesn't work for this type of query + # This is not an issue in archival mode, as there is a redirect to the actual timestamp query + # but may be an issue in proxy mode ArchivalUrl.LATEST_REPLAY: {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} diff --git a/pywb/regexmatch.py b/pywb/regexmatch.py index 9f3d4242..c2b61bbc 100644 --- a/pywb/regexmatch.py +++ b/pywb/regexmatch.py @@ -91,8 +91,8 @@ class JSRewriter(RegexRewriter): """ - def __init__(self, httpPrefix, extra = []): - rules = self._createRules(httpPrefix) + def __init__(self, rewriter, extra = []): + rules = self._createRules(rewriter.getAbsUrl()) rules.extend(extra) RegexRewriter.__init__(self, rules) @@ -167,12 +167,10 @@ class CSSRewriter(RegexRewriter): if __name__ == "__main__": import doctest - rwPrefix = '/web/20131010im_/' - arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/') def test_js(string, extra = []): - return JSRewriter(rwPrefix, extra).replaceAll(string) + return JSRewriter(arcrw, extra).replaceAll(string) def test_css(string): return CSSRewriter(arcrw).replaceAll(string) diff --git a/pywb/replay.py b/pywb/replay.py index 1c62a5d6..a992a5e4 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -1,8 +1,32 @@ +import StringIO + import indexreader from wbrequestresponse import WbResponse +from wbarchivalurl import ArchivalUrl import utils +from wburlrewriter import ArchivalUrlRewriter -class ReplayHandler: +import wbhtml +import regexmatch +import wbexceptions + +#================================================================= +class FullHandler: + def __init__(self, query, replay): + self.query = query + self.replay = replay + + def __call__(self, wbrequest, _): + query_response = self.query(wbrequest, None) + + if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY): + return query_response + + return self.replay(wbrequest, query_response) + + +#================================================================= +class ReplayHandler(object): def __init__(self, resolvers, archiveloader): self.resolvers = resolvers self.archiveloader = archiveloader @@ -11,38 +35,45 @@ class ReplayHandler: cdxlist = query_response.body last_e = None first = True + for cdx in cdxlist: try: cdx = indexreader.CDXCaptureResult(cdx) - # First time through, check if do redirect before warc load - if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp): - return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) + # ability to intercept and redirect + if first: + self._checkRedir(wbrequest, cdx) + first = False response = self.doReplay(cdx, wbrequest) if response: - # if a fallback, redirect to exact timestamp! - if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp): - response.close() - return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) - + response.cdx = cdx return response - first = False + #except wbexceptions.InternalRedirect as ir: + # raise ir - except Exception, e: + except wbexceptions.CaptureException as ce: import traceback traceback.print_exc() - last_e = e + last_e = ce pass if last_e: raise last_e + else: + raise wbexceptions.ArchiveLoadFailed() + + def _checkRedir(self, wbrequest, cdx): + return None def _load(self, cdx, revisit = False): - prefix = '' if not revisit else 'orig.' - return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length']) + if revisit: + return self.archiveloader.load(self.resolveFull(cdx['orig.filename']), cdx['orig.offset'], cdx['orig.length']) + else: + return self.archiveloader.load(self.resolveFull(cdx['filename']), cdx['offset'], cdx['length']) + def doReplay(self, cdx, wbrequest): hasCurr = (cdx['filename'] != '-') @@ -75,19 +106,8 @@ class ReplayHandler: else: raise wbexceptions.CaptureException('Invalid CDX' + cdx) - # Check for self redirect - if headersRecord.statusline.startswith('3'): - if self.isSelfRedirect(wbrequest, headersRecord): - raise wbexception.CaptureException('Self Redirect: ' + cdx) - return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream) - def isSelfRedirect(self, wbrequest, record): - requestUrl = wbrequest.wb_url.url.lower() - locationUrl = utils.get_header(record.httpHeaders, 'Location').lower() - return requestUrl == locationUrl - #ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl) - def resolveFull(self, filename): # Attempt to resolve cdx file to full path @@ -100,6 +120,164 @@ class ReplayHandler: raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename) +#================================================================= +class RewritingReplayHandler(ReplayHandler): + + + REWRITE_TYPES = { + 'html': ('text/html', 'application/xhtml'), + 'css': ('text/css'), + 'js': ('text/javascript', 'application/javascript', 'application/x-javascript'), + 'xml': ('/xml', '+xml', '.xml', '.rss'), + } + + + PROXY_HEADERS = ('content-type', 'content-disposition') + + URL_REWRITE_HEADERS = ('location', 'content-location', 'content-base') + + ENCODING_HEADERS = ('content-encoding', 'transfer-encoding') + + + def __init__(self, resolvers, archiveloader, headerPrefix = 'X-Archive-Orig-', headInsert = None): + ReplayHandler.__init__(self, resolvers, archiveloader) + self.headerPrefix = headerPrefix + self.headInsert = headInsert + + + def _canonContentType(self, contentType): + for type, mimelist in self.REWRITE_TYPES.iteritems(): + for mime in mimelist: + if mime in contentType: + return type + + return None + + + def __call__(self, wbrequest, query_response): + urlrewriter = ArchivalUrlRewriter(wbrequest.wb_url, wbrequest.wb_prefix) + wbrequest.urlrewriter = urlrewriter + + response = ReplayHandler.__call__(self, wbrequest, query_response) + + if response and response.cdx: + self._checkRedir(wbrequest, response.cdx) + + # Transparent! + if wbrequest.wb_url.mod == 'id_': + return response + + contentType = utils.get_header(response.headersList, 'Content-Type') + + canonType = self._canonContentType(contentType) + + (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, (canonType is not None)) + + # binary type, just send through + if canonType is None: + response.headersList = newHeaders + return response + + # Handle text rewriting + # TODO: better way to pass this + stream = response._stream + + # special case -- need to ungzip the body + if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))): + stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)) + + return self._rewriteContent(canonType, urlrewriter, stream, newHeaders, response) + + # TODO: first non-streaming attempt, probably want to stream + def _rewriteContent(self, canonType, urlrewriter, stream, newHeaders, origResponse): + if canonType == 'html': + out = StringIO.StringIO() + htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) + + try: + buff = stream.read() + while buff: + htmlrewriter.feed(buff) + buff = stream.read() + + htmlrewriter.close() + + #except Exception as e: + # print e + + finally: + value = [out.getvalue()] + out.close() + + else: + if canonType == 'css': + rewriter = regexmatch.CSSRewriter(urlrewriter) + elif canonType == 'js': + rewriter = regexmatch.JSRewriter(urlrewriter) + + def gen(): + try: + buff = stream.read() + while buff: + yield rewriter.replaceAll(buff) + buff = stream.read() + + finally: + stream.close() + + value = gen() + + return WbResponse(status = origResponse.status, headersList = newHeaders, value = value) + + + + def _rewriteHeaders(self, headers, stripEncoding = False): + newHeaders = [] + removedHeaders = [] + + for (name, value) in headers: + lowername = name.lower() + if lowername in self.PROXY_HEADERS: + newHeaders.append((name, value)) + elif lowername in self.URL_REWRITE_HEADERS: + newHeaders.append((name, urlrewriter.rewrite(value))) + elif lowername in self.ENCODING_HEADERS: + if stripEncoding: + removedHeaders.append((name, value)) + else: + newHeaders.append((name, value)) + else: + newHeaders.append((self.headerPrefix + name, value)) + + return (newHeaders, removedHeaders) + + + def _checkRedir(self, wbrequest, cdx): + if cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original']) + raise wbexceptions.InternalRedirect(newUrl) + #return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) + + return None + + + def doReplay(self, cdx, wbrequest): + wbresponse = ReplayHandler.doReplay(self, cdx, wbrequest) + + # Check for self redirect + if wbresponse.status.startswith('3'): + if self.isSelfRedirect(wbrequest, wbresponse.headersList): + raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx)) + + return wbresponse + + def isSelfRedirect(self, wbrequest, httpHeaders): + requestUrl = wbrequest.wb_url.url.lower() + locationUrl = utils.get_header(httpHeaders, 'Location').lower() + #return requestUrl == locationUrl + return (ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)) + + #====================================== # PrefixResolver - convert cdx file entry to url with prefix if url contains specified string #====================================== diff --git a/pywb/run.sh b/pywb/run.sh deleted file mode 100755 index c8a1198d..00000000 --- a/pywb/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh - -app=$1 -if [ -z "$app" ]; then - app=wbapp.py -fi - -uwsgi --http :9090 --wsgi-file $app diff --git a/pywb/utils.py b/pywb/utils.py index 89dc7459..ba427e55 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -17,6 +17,16 @@ def get_header(headersList, name): if (value[0].lower() == nameLower): return value[1] + return None + +def contains_header(headersList, seekHeader): + header = get_header(headersList, seekHeader[0]) + if not header: + return False + + # see if found header matches value! + return (header == seekHeader[1]) + class HMACCookieMaker: def __init__(self, key, name): self.key = key diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 755550de..5a70c42b 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,4 +1,5 @@ from query import QueryHandler +from replay import FullHandler import wbexceptions from wbrequestresponse import WbResponse @@ -19,7 +20,16 @@ class WBHandler: query = QueryHandler() import testwb -replay = testwb.createReplay() + +headInsert = """ + + + + + +""" + +replay = testwb.createReplay(headInsert) ## =========== parser = ArchivalRequestRouter( @@ -28,6 +38,7 @@ parser = ArchivalRequestRouter( 't1' : [WBHandler()], 't2' : [query], 't3' : [query, replay], + 'web': FullHandler(query, replay) }, hostpaths = ['http://localhost:9090/']) ## =========== @@ -42,6 +53,9 @@ def application(env, start_response): if not response: raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found') + except wbexceptions.InternalRedirect as ir: + response = WbResponse(status = ir.status, headersList = ir.httpHeaders) + except Exception as e: last_exc = e import traceback diff --git a/pywb/wbexceptions.py b/pywb/wbexceptions.py index ac882b4f..036f309b 100644 --- a/pywb/wbexceptions.py +++ b/pywb/wbexceptions.py @@ -38,4 +38,15 @@ class InvalidArchiveRecordException(CaptureException): super(InvalidArchiveRecordException, self).__init__(msg) self.errList = errList +class ArchiveLoadFailed(CaptureException): + pass + +class InternalRedirect(Exception): + def __init__(self, location, status = '302 Internal Redirect'): + Exception.__init__(self, 'Redirecting -> ' + location) + self.status = status + self.httpHeaders = [('Location', location)] + + def status(_): + return self.status diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py index 946715da..05e81e40 100644 --- a/pywb/wbhtml.py +++ b/pywb/wbhtml.py @@ -22,6 +22,9 @@ class WBHtml(HTMLParser): >>> parse('') + >>> parse('') + + >>> parse('') @@ -41,7 +44,18 @@ class WBHtml(HTMLParser): # Unterminated style tag auto-terminate >>> parse(' - """ + + # Head Insertion + >>> parse('
Test', headInsert = '') + Test + + >>> parse('