# pywb/archiveloader.py -- post-image reconstructed from the collapsed diff,
# with review fixes applied.  Python 2 code (urllib2 / StringIO / iteritems).
import hanzo.warctools

import re
import utils
import wbexceptions   # FIX: was missing -- this module raises wbexceptions.* below
import zlib
import urllib2
import StringIO
import urlparse
import collections


#=================================================================
class HttpStreamLoader:
    """Load a byte range of a remote archive file over HTTP(S).

    hmac, if given, is a callable (e.g. utils.HMACCookieMaker) producing a
    Cookie header value valid for hmacDuration seconds.
    """
    def __init__(self, hmac=None, hmacDuration=30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

    def load(self, url, offset, length):
        """Open url with an HTTP Range request for [offset, offset+length)."""
        # Range header end is inclusive; open-ended when length is unknown
        if length:
            rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1)
        else:
            rangeHeader = 'bytes={0}-'.format(offset)

        headers = {'Range': rangeHeader}

        if self.hmac:
            headers['Cookie'] = self.hmac(self.hmacDuration)

        request = urllib2.Request(url, headers=headers)
        return urllib2.urlopen(request)


#=================================================================
# parsed record, payload stream, HTTP status line, HTTP header list
WBArchiveRecord = collections.namedtuple('WBArchiveRecord',
                                         'parsed, stream, statusline, httpHeaders')


#=================================================================
class ArchiveLoader:
    """Load and parse a single (W)ARC record at a given file offset."""

    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on
    # file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz':  (hanzo.warctools.ArcRecord,  'arc',  True),
        '.warc':    (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc':     (hanzo.warctools.ArcRecord,  'arc',  False),
    }

    HTTP_STATUS_REGEX = re.compile(r'^HTTP/[\d.]+ ((\d+).*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        return {
            'http': http,
            'https': http,
        }

    def __init__(self, loaders={}, chunkSize=8192):
        # NOTE(review): mutable default kept for interface compatibility;
        # it is never mutated, only tested for truthiness.
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

    def load(self, url, offset, length):
        """Fetch the record at url@offset and return a WBArchiveRecord.

        Raises UnknownLoaderProtocolException / UnknownArchiveFormatException /
        InvalidArchiveRecordException on failure.
        """
        urlParts = urlparse.urlsplit(url)

        # FIX: dict.get() never raises, so the original try/except around it
        # could never fire -- test the result explicitly instead.
        loader = self.loaders.get(urlParts.scheme)
        if loader is None:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        # FIX: the original initialized loaderCls = None but then shadowed it
        # with the loop variable (and self-assigned loaderCls = loaderCls),
        # so the unknown-format check below was unreachable and isGzip could
        # be referenced while unbound.
        entry = None
        for ext, fmt in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                entry = fmt
                break

        if entry is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        (loaderCls, aFormat, isGzip) = entry

        # 16 + MAX_WBITS tells zlib to expect a gzip wrapper
        decomp = zlib.decompressobj(16 + zlib.MAX_WBITS) if isGzip else None

        raw = loader.load(url, offset, length)

        reader = LineReader(raw, self.chunkSize, decomp)

        parser = loaderCls.make_parser()

        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        # NOTE(review): arc 'length' is compared to int 0 while warc
        # 'Content-Length' is compared to the string '0' -- confirm the arc
        # parser really yields ints here
        if aFormat == 'arc':
            recType = 'arc-response'
            empty = (utils.get_header(parsed.headers, 'length') == 0)
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')

        parsed.recType = recType
        parsed.aFormat = aFormat

        if empty:
            # NOTE(review): '400' as the statusline for an empty record looks
            # suspicious (204 would be conventional) -- confirm intent
            return WBArchiveRecord(parsed, reader, '400', [])

        elif recType == 'metadata' or recType == 'resource':
            # metadata/resource records carry no HTTP envelope of their own
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
            return WBArchiveRecord(parsed, reader, '200 OK', headers)

        else:
            (statusline, headers) = self.parseHttpHeaders(reader)
            return WBArchiveRecord(parsed, reader, statusline, headers)

    def parseHttpHeaders(self, stream):
        """Parse an HTTP status line plus headers from stream.

        Returns (statusline, [(name, value), ...]); raises
        InvalidArchiveRecordException when no status line is found.
        """
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)

        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        #status = int(matched.group(2))
        statusline = matched.group(1)
        headers = []

        line = nextHeaderLine(stream)

        # rstrip() above removes '\r\n', so the blank separator line reads as
        # '' and terminates the loop; the explicit != '\r\n' test is kept
        # from the original as a harmless belt-and-braces check.
        while line and line != '\r\n':
            name, value = line.split(':', 1)
            headers.append((name, value.strip()))
            line = nextHeaderLine(stream)

        return (statusline, headers)


#=================================================================
class LineReader:
    """Buffered, optionally gzip-decompressing reader over a raw byte stream."""

    def __init__(self, stream, chunkSize=1024, decomp=None):
        self.stream = stream
        self.chunkSize = chunkSize
        self.decomp = decomp
        self.buff = None
        self.numread = 0   # raw (compressed) bytes consumed from stream

    def _fillbuff(self, chunkSize=None):
        if not chunkSize:
            chunkSize = self.chunkSize

        # Refill only once the current buffer is exhausted
        # (StringIO.StringIO exposes .pos/.len in Python 2)
        if not self.buff or self.buff.pos >= self.buff.len:
            data = self.stream.read(chunkSize)
            self.numread += len(data)
            if self.decomp:
                data = self.decomp.decompress(data)
            self.buff = StringIO.StringIO(data)

    def read(self):
        self._fillbuff()
        return self.buff.read()

    def readline(self):
        self._fillbuff()
        return self.buff.readline()

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None
'mimetype': 'warc/revisit', + 'offset': '247256770', + 'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz', + 'orig.length': '529', + 'orig.offset': '769759', + 'original': 'http://www.example.com/', + 'redirect': '-', + 'robotflags': '-', + 'statuscode': '-', + 'timestamp': '20131210052355', + 'urlkey': 'com,example)/'} + """ def __init__(self, serverUrl): self.serverUrl = serverUrl @@ -69,9 +86,22 @@ class RemoteCDXServer: }[wburl.type] -class CDXCaptureResult: - CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]] +class CDXCaptureResult(dict): + CDX_FORMATS = [ + # CDX 11 Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + + # CDX 9 Format + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"], + + # CDX 11 Format + 3 revisit resolve fields + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename", + "orig.length","orig.offset","orig.filename"], + + # CDX 9 Format + 3 revisit resolve fields + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename", + "orig.length","orig.offset","orig.filename"] + ] def __init__(self, cdxline): cdxline = cdxline.rstrip() @@ -83,13 +113,14 @@ class CDXCaptureResult: cdxformat = i if not cdxformat: - raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields))) + raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields))) - for header, field in zip(cdxformat, fields): - setattr(self, header, field) + for header, field in itertools.izip(cdxformat, fields): + self[header] = field + # setattr(self, header, field) - def 
__repr__(self): - return str(vars(self)) + #def __repr__(self): + # return str(vars(self)) diff --git a/pywb/query.py b/pywb/query.py new file mode 100644 index 00000000..4f5574b8 --- /dev/null +++ b/pywb/query.py @@ -0,0 +1,24 @@ +import indexreader +import utils +import wbrequestresponse +import wbexceptions + +class QueryHandler: + def __init__(self): + self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') + + def __call__(self, wbrequest, prev_wbresponse): + wburl = wbrequest.wb_url + + params = self.cdxserver.getQueryParams(wburl) + + cdxlines = self.cdxserver.load(wburl.url, params) + + cdxlines = utils.peek_iter(cdxlines) + + if cdxlines is not None: + return wbrequestresponse.WbResponse.text_stream(cdxlines) + + raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url) + + diff --git a/pywb/replay.py b/pywb/replay.py new file mode 100644 index 00000000..1c62a5d6 --- /dev/null +++ b/pywb/replay.py @@ -0,0 +1,110 @@ +import indexreader +from wbrequestresponse import WbResponse +import utils + +class ReplayHandler: + def __init__(self, resolvers, archiveloader): + self.resolvers = resolvers + self.archiveloader = archiveloader + + def __call__(self, wbrequest, query_response): + cdxlist = query_response.body + last_e = None + first = True + for cdx in cdxlist: + try: + cdx = indexreader.CDXCaptureResult(cdx) + + # First time through, check if do redirect before warc load + if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) + + response = self.doReplay(cdx, wbrequest) + + if response: + # if a fallback, redirect to exact timestamp! 
+ if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp): + response.close() + return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp']) + + return response + + first = False + + except Exception, e: + import traceback + traceback.print_exc() + last_e = e + pass + + if last_e: + raise last_e + + def _load(self, cdx, revisit = False): + prefix = '' if not revisit else 'orig.' + return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length']) + + def doReplay(self, cdx, wbrequest): + hasCurr = (cdx['filename'] != '-') + hasOrig = (cdx['orig.filename'] != '-') + + # Case 1: non-revisit + if (hasCurr and not hasOrig): + headersRecord = self._load(cdx, False) + payloadRecord = headersRecord + isRevisit = False + + # Case 2: old-style revisit, load headers from original payload + elif (not hasCurr and hasOrig): + payloadRecord = self._load(cdx, False) + headersRecord = payloadRecord + isRevisit = True + + # Case 3: modern revisit, load headers from curr, payload from original + elif (hasCurr and hasOrig): + headersRecord = self._load(cdx, False) + payloadRecord = self._load(cdx, True) + + # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit + if not headersRecord.httpHeaders: + headersRecord.close() + headersRecord = payloadRecord + + isRevisit = True + + else: + raise wbexceptions.CaptureException('Invalid CDX' + cdx) + + # Check for self redirect + if headersRecord.statusline.startswith('3'): + if self.isSelfRedirect(wbrequest, headersRecord): + raise wbexception.CaptureException('Self Redirect: ' + cdx) + + return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream) + + def isSelfRedirect(self, wbrequest, record): + requestUrl = wbrequest.wb_url.url.lower() + locationUrl = utils.get_header(record.httpHeaders, 'Location').lower() + return requestUrl == locationUrl + 
#ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl) + + + def resolveFull(self, filename): + # Attempt to resolve cdx file to full path + fullUrl = None + for resolver in self.resolvers: + fullUrl = resolver(filename) + if fullUrl: + return fullUrl + + raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename) + + +#====================================== +# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string +#====================================== +def PrefixResolver(prefix, contains): + def makeUrl(url): + return prefix + url if (contains in url) else None + + return makeUrl diff --git a/pywb/utils.py b/pywb/utils.py index ee70be6a..89dc7459 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -1,4 +1,6 @@ import itertools +import hmac +import time def peek_iter(iterable): try: @@ -7,3 +9,39 @@ def peek_iter(iterable): return None return itertools.chain([first], iterable) + + +def get_header(headersList, name): + nameLower = name.lower() + for value in headersList: + if (value[0].lower() == nameLower): + return value[1] + +class HMACCookieMaker: + def __init__(self, key, name): + self.key = key + self.name = name + + + def __call__(self, duration, extraId = ''): + expire = str(long(time.time() + duration)) + + if extraId: + msg = extraId + '-' + expire + else: + msg = expire + + hmacdigest = hmac.new(self.key, msg) + hexdigest = hmacdigest.hexdigest() + + if extraId: + cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest) + else: + cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest) + + return cookie + + #return cookie + hexdigest + + + diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 53edd319..755550de 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,11 +1,11 @@ -import indexreader -import json +from query import QueryHandler import wbexceptions -import utils from wbrequestresponse import WbResponse from archivalrouter 
# ----------------------------------------------------------------
# pywb/wbexceptions.py  (additions, post-image)
# ----------------------------------------------------------------

# Exceptions that affect a specific capture and result in a retry
class CaptureException(Exception):
    # file convention: '_' stands in for self; returns the HTTP status string
    def status(_):
        return '500'


class UnresolvedArchiveFileException(CaptureException):
    pass


class UnknownArchiveFormatException(CaptureException):
    pass


class UnknownLoaderProtocolException(CaptureException):
    pass


class InvalidArchiveRecordException(CaptureException):
    # FIX: original signature was 'def __init__(msg, errList=None)' -- 'self'
    # was missing, so the instance itself landed in 'msg' and constructing
    # the exception with a message raised TypeError.
    def __init__(self, msg, errList=None):
        super(InvalidArchiveRecordException, self).__init__(msg)
        # parser errors that triggered this exception, if any
        self.errList = errList


# ----------------------------------------------------------------
# pywb/wbrequestresponse.py  (changed WbResponse methods, post-image;
# the rest of the class is unchanged context not shown in the diff)
# ----------------------------------------------------------------
class WbResponse:
    @staticmethod
    def stream_response(statusline, headers, stream):
        """Wrap an open record stream in a WbResponse that streams then closes it."""
        def streamGen():
            try:
                buff = stream.read()
                while buff:
                    yield buff
                    buff = stream.read()
            finally:
                # always release the underlying archive stream
                stream.close()

        return WbResponse(statusline, headersList=headers, value=streamGen())

    @staticmethod
    def better_timestamp_response(wbrequest, newTimestamp):
        # Redirect to the same url at the closest available capture timestamp
        wbrequest.wb_url.timestamp = newTimestamp
        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
        return WbResponse.redir_response(newUrl)

    def get_header(self, name):
        # delegate to the shared case-insensitive helper in utils
        return utils.get_header(self.headersList, name)

    def __call__(self, env, start_response):
        start_response(self.status, self.headersList)

        # HEAD: send no body; close a streaming body so the archive record
        # is released.  NOTE(review): the diff had an unreachable
        # 'return self.body' inside this branch -- reconstructed as
        # returning an empty body, per WSGI HEAD semantics; confirm.
        if env['REQUEST_METHOD'] == 'HEAD':
            if hasattr(self.body, 'close'):
                self.body.close()
            return []

        if hasattr(self.body, '__iter__'):
            return self.body
        else:
            return [str(self.body)]


# ----------------------------------------------------------------
# pywb/wburlrewriter.py  (added static method, post-image; the rest of
# ArchivalUrlRewriter is unchanged context not shown in the diff)
# ----------------------------------------------------------------
class ArchivalUrlRewriter:
    @staticmethod
    def stripProtocol(url):
        """Strip any known scheme prefix (ArchivalUrlRewriter.PROTOCOLS) from url."""
        for protocol in ArchivalUrlRewriter.PROTOCOLS:
            if url.startswith(protocol):
                return url[len(protocol):]

        return url