mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
archiveloader: support for loading WARC/ARC records using the hanzo parser (for record header parsing only)
ReplayHandler: load replay from the query response and pick the best matching capture; basic support for matching the url and checking for self-redirects!
This commit is contained in:
  parent 787dfc136e
  commit 16f458d5ec

pywb/archiveloader.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import hanzo.warctools

import re
import utils
import zlib
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions


#=================================================================
# Loads a byte range of a remote file over HTTP, optionally sending
# an HMAC cookie for access control
class HttpStreamLoader:
    def __init__(self, hmac = None, hmacDuration = 30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

    def load(self, url, offset, length):
        if length:
            rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1)
        else:
            rangeHeader = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = rangeHeader

        if self.hmac:
            headers['Cookie'] = self.hmac(self.hmacDuration)

        request = urllib2.Request(url, headers = headers)
        return urllib2.urlopen(request)


#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders')


#=================================================================
class ArchiveLoader:
    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz':  (hanzo.warctools.ArcRecord,  'arc',  True),
        '.warc':    (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc':     (hanzo.warctools.ArcRecord,  'arc',  False),
    }

    HTTP_STATUS_REGEX = re.compile(r'^HTTP/[\d.]+ ((\d+).*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        return {
            'http': http,
            'https': http,
        }

    def __init__(self, loaders = None, chunkSize = 8192):
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

    def load(self, url, offset, length):
        urlParts = urlparse.urlsplit(url)

        loader = self.loaders.get(urlParts.scheme)
        if not loader:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        loaderCls = None

        for ext, (cls, fmt, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                loaderCls = cls
                aFormat = fmt
                isGzip = gzip
                break

        if loaderCls is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        if isGzip:
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
        else:
            decomp = None

        raw = loader.load(url, offset, length)

        reader = LineReader(raw, self.chunkSize, decomp)

        parser = loaderCls.make_parser()

        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        if aFormat == 'arc':
            recType = 'arc-response'
            empty = (utils.get_header(parsed.headers, 'length') == '0')
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')

        parsed.recType = recType
        parsed.aFormat = aFormat

        # empty record (eg. a 0-length revisit); nothing further to parse
        if empty:
            return WBArchiveRecord(parsed, reader, '400', [])

        elif recType == 'metadata' or recType == 'resource':
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
            return WBArchiveRecord(parsed, reader, '200 OK', headers)

        else:
            (statusline, headers) = self.parseHttpHeaders(reader)
            return WBArchiveRecord(parsed, reader, statusline, headers)


    def parseHttpHeaders(self, stream):
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)

        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        #status = int(matched.group(2))
        statusline = matched.group(1)
        headers = []

        line = nextHeaderLine(stream)

        while line and line != '\r\n':
            name, value = line.split(':', 1)
            value = value.strip()
            headers.append((name, value))
            line = nextHeaderLine(stream)

        return (statusline, headers)


#=================================================================
# Buffered reader over a raw stream, with optional gzip decompression
class LineReader:
    def __init__(self, stream, chunkSize = 1024, decomp = None):
        self.stream = stream
        self.chunkSize = chunkSize
        self.decomp = decomp
        self.buff = None
        self.numread = 0

    def _fillbuff(self, chunkSize = None):
        if not chunkSize:
            chunkSize = self.chunkSize

        # refill only when the current buffer is exhausted
        if not self.buff or self.buff.pos >= self.buff.len:
            data = self.stream.read(chunkSize)
            self.numread += len(data)
            if self.decomp:
                data = self.decomp.decompress(data)

            self.buff = StringIO.StringIO(data)

    def read(self):
        self._fillbuff()
        return self.buff.read()

    def readline(self):
        self._fillbuff()
        return self.buff.readline()

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None
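A minimal usage sketch of the new loader (the WARC URL, offset and length below are made-up stand-ins for values a real CDX entry would supply):

# Sketch only: url/offset/length are hypothetical
loader = ArchiveLoader()
record = loader.load('http://archive.example.org/col/example.warc.gz',
                     '247256770', '523')

print record.statusline      # e.g. '200 OK'
print record.httpHeaders     # list of (name, value) tuples
print record.stream.read()   # next buffered chunk of the payload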
pywb/indexreader.py
@@ -1,13 +1,14 @@
 import urllib
 import urllib2
 import wbexceptions
+import itertools
 
 from wbarchivalurl import ArchivalUrl
 
 class RemoteCDXServer:
     """
     >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
-    >>> pprint(vars(x[0]))
+    >>> pprint(x[0])
     {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
      'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
      'length': '1792',
@@ -20,7 +21,23 @@ class RemoteCDXServer:
      'timestamp': '20020120142510',
      'urlkey': 'com,example)/'}
 
-    """
+    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
+    >>> pprint(x[0])
+    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
+     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
+     'length': '523',
+     'mimetype': 'warc/revisit',
+     'offset': '247256770',
+     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
+     'orig.length': '529',
+     'orig.offset': '769759',
+     'original': 'http://www.example.com/',
+     'redirect': '-',
+     'robotflags': '-',
+     'statuscode': '-',
+     'timestamp': '20131210052355',
+     'urlkey': 'com,example)/'}
+    """
 
     def __init__(self, serverUrl):
         self.serverUrl = serverUrl
@@ -69,9 +86,22 @@ class RemoteCDXServer:
         }[wburl.type]
 
 
-class CDXCaptureResult:
-    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
+class CDXCaptureResult(dict):
+    CDX_FORMATS = [
+        # CDX 11 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+
+        # CDX 9 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+
+        # CDX 11 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
+         "orig.length","orig.offset","orig.filename"],
+
+        # CDX 9 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
+         "orig.length","orig.offset","orig.filename"]
+    ]
 
     def __init__(self, cdxline):
         cdxline = cdxline.rstrip()
@@ -83,13 +113,14 @@ class CDXCaptureResult:
                 cdxformat = i
 
         if not cdxformat:
-            raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
+            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
 
-        for header, field in zip(cdxformat, fields):
-            setattr(self, header, field)
+        for header, field in itertools.izip(cdxformat, fields):
+            self[header] = field
+            # setattr(self, header, field)
 
-    def __repr__(self):
-        return str(vars(self))
+    #def __repr__(self):
+    #    return str(vars(self))
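Since CDXCaptureResult is now a dict keyed by the matched format's field names, a single line can be parsed like this (a sketch; the 9-field CDX line below is fabricated):

# Sketch: parse a fabricated 9-field CDX line
line = 'com,example)/ 20020120142510 http://example.com/ text/html 200 HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA - 59 DJ_crawl3.20020120141301.arc.gz'
cdx = CDXCaptureResult(line)
print cdx['timestamp']   # '20020120142510'
print cdx['filename']    # 'DJ_crawl3.20020120141301.arc.gz'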
pywb/query.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions


# Queries the cdx server for the requested url and streams back the raw
# cdx lines, or raises NotFoundException if there are none
class QueryHandler:
    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest, prev_wbresponse):
        wburl = wbrequest.wb_url

        params = self.cdxserver.getQueryParams(wburl)

        cdxlines = self.cdxserver.load(wburl.url, params)

        # peek to distinguish an empty result from a non-empty one
        cdxlines = utils.peek_iter(cdxlines)

        if cdxlines is not None:
            return wbrequestresponse.WbResponse.text_stream(cdxlines)

        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
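The handler leans on utils.peek_iter (extended below) to tell an empty CDX result from a non-empty one without consuming it; a sketch of that contract:

# peek_iter returns None for an exhausted iterator, otherwise an
# equivalent iterator with the first item pushed back
import utils

assert utils.peek_iter(iter([])) is None
lines = utils.peek_iter(iter(['com,example)/ 20020120142510 cdx-line\n']))
assert list(lines) == ['com,example)/ 20020120142510 cdx-line\n']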
pywb/replay.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import indexreader
import wbexceptions
from wbrequestresponse import WbResponse
import utils


class ReplayHandler:
    def __init__(self, resolvers, archiveloader):
        self.resolvers = resolvers
        self.archiveloader = archiveloader

    def __call__(self, wbrequest, query_response):
        cdxlist = query_response.body
        last_e = None
        first = True
        for cdx in cdxlist:
            try:
                cdx = indexreader.CDXCaptureResult(cdx)

                # First time through, check if do redirect before warc load
                if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                    return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                response = self.doReplay(cdx, wbrequest)

                if response:
                    # if a fallback, redirect to exact timestamp!
                    if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                        response.close()
                        return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                    return response

                first = False

            except Exception, e:
                import traceback
                traceback.print_exc()
                last_e = e
                pass

        if last_e:
            raise last_e

    def _load(self, cdx, revisit = False):
        prefix = '' if not revisit else 'orig.'
        return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length'])

    def doReplay(self, cdx, wbrequest):
        hasCurr = (cdx['filename'] != '-')
        hasOrig = (cdx['orig.filename'] != '-')

        # Case 1: non-revisit
        if (hasCurr and not hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = headersRecord
            isRevisit = False

        # Case 2: old-style revisit, load headers from original payload
        elif (not hasCurr and hasOrig):
            payloadRecord = self._load(cdx, False)
            headersRecord = payloadRecord
            isRevisit = True

        # Case 3: modern revisit, load headers from curr, payload from original
        elif (hasCurr and hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = self._load(cdx, True)

            # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
            if not headersRecord.httpHeaders:
                headersRecord.close()
                headersRecord = payloadRecord

            isRevisit = True

        else:
            raise wbexceptions.CaptureException('Invalid CDX: ' + str(cdx))

        # Check for self redirect
        if headersRecord.statusline.startswith('3'):
            if self.isSelfRedirect(wbrequest, headersRecord):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)

    def isSelfRedirect(self, wbrequest, record):
        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = utils.get_header(record.httpHeaders, 'Location')
        if not locationUrl:
            return False
        return requestUrl == locationUrl.lower()
        #ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)


    def resolveFull(self, filename):
        # Attempt to resolve cdx file to full path
        fullUrl = None
        for resolver in self.resolvers:
            fullUrl = resolver(filename)
            if fullUrl:
                return fullUrl

        raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)


#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
    def makeUrl(url):
        return prefix + url if (contains in url) else None

    return makeUrl
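testwb.createReplay(), referenced by the test app below, is not part of this diff; a hypothetical equivalent wiring of the pieces above might look like:

# Hypothetical wiring; testwb.createReplay itself is not shown in this commit,
# and the prefix url is made up
import archiveloader
import replay

def createReplay():
    # map CDX 'filename' entries containing '.warc' to a full fetchable url
    resolver = replay.PrefixResolver('http://archive.example.org/files/', '.warc')
    return replay.ReplayHandler(resolvers = [resolver],
                                archiveloader = archiveloader.ArchiveLoader())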
pywb/utils.py
@@ -1,4 +1,6 @@
 import itertools
+import hmac
+import time
 
 def peek_iter(iterable):
     try:
@@ -7,3 +9,39 @@ def peek_iter(iterable):
         return None
 
     return itertools.chain([first], iterable)
+
+
+def get_header(headersList, name):
+    nameLower = name.lower()
+    for value in headersList:
+        if (value[0].lower() == nameLower):
+            return value[1]
+
+
+class HMACCookieMaker:
+    def __init__(self, key, name):
+        self.key = key
+        self.name = name
+
+
+    def __call__(self, duration, extraId = ''):
+        expire = str(long(time.time() + duration))
+
+        if extraId:
+            msg = extraId + '-' + expire
+        else:
+            msg = expire
+
+        hmacdigest = hmac.new(self.key, msg)
+        hexdigest = hmacdigest.hexdigest()
+
+        if extraId:
+            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
+        else:
+            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
+
+        return cookie
+
+        #return cookie + hexdigest
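A sketch of how HMACCookieMaker plugs into HttpStreamLoader from archiveloader.py (the key and cookie name below are made up):

# Sketch: key and cookie name are illustrative
import utils
import archiveloader

cookieMaker = utils.HMACCookieMaker('secret-key', 'wbhmac')
print cookieMaker(30)    # e.g. 'wbhmac=1389000030-<hexdigest>'

# HttpStreamLoader calls cookieMaker(hmacDuration) and sends the result
# as a Cookie header on each range request
loader = archiveloader.HttpStreamLoader(hmac = cookieMaker, hmacDuration = 30)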
pywb/wbapp.py
@@ -1,11 +1,11 @@
-import indexreader
+from query import QueryHandler
-import json
 import wbexceptions
-import utils
 
 from wbrequestresponse import WbResponse
 from archivalrouter import ArchivalRequestRouter
 
+
+## ===========
 class EchoEnv:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest.env))
@@ -14,33 +14,20 @@ class WBHandler:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest))
 
-class QueryHandler:
-    def __init__(self):
-        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
-
-    def __call__(self, wbrequest, prev_wbresponse):
-        wburl = wbrequest.wb_url
-
-        params = self.cdxserver.getQueryParams(wburl)
-
-        cdxlines = self.cdxserver.load(wburl.url, params)
-
-        cdxlines = utils.peek_iter(cdxlines)
-
-        if cdxlines is not None:
-            return WbResponse.text_stream(cdxlines)
-
-        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
-
+## ===========
+query = QueryHandler()
+
+import testwb
+replay = testwb.createReplay()
+
 ## ===========
 parser = ArchivalRequestRouter(
     {
         't0' : [EchoEnv()],
         't1' : [WBHandler()],
-        't2' : [QueryHandler()]
+        't2' : [query],
+        't3' : [query, replay],
     },
     hostpaths = ['http://localhost:9090/'])
 ## ===========
@@ -63,6 +50,7 @@ def application(env, start_response):
 
     return response(env, start_response)
 
+
 def handleException(env, exc):
     if hasattr(exc, 'status'):
         status = exc.status()
pywb/wbexceptions.py
@@ -18,3 +18,24 @@ class InvalidCDXException(Exception):
 class NotFoundException(Exception):
     def status(_):
         return '404'
+
+
+# Exceptions that affect a specific capture and result in a retry
+class CaptureException(Exception):
+    def status(_):
+        return '500'
+
+class UnresolvedArchiveFileException(CaptureException):
+    pass
+
+class UnknownArchiveFormatException(CaptureException):
+    pass
+
+class UnknownLoaderProtocolException(CaptureException):
+    pass
+
+class InvalidArchiveRecordException(CaptureException):
+    def __init__(self, msg, errList = None):
+        super(InvalidArchiveRecordException, self).__init__(msg)
+        self.errList = errList
pywb/wbrequestresponse.py
@@ -1,4 +1,5 @@
 from wbarchivalurl import ArchivalUrl
+import utils
 #WB Request and Response
 
 class WbRequest:
@@ -106,11 +107,27 @@ class WbResponse:
     def redir_response(location, status = '302 Redirect'):
         return WbResponse(status, headersList = [('Location', location)])
 
+    @staticmethod
+    def stream_response(statusline, headers, stream):
+        def streamGen():
+            try:
+                buff = stream.read()
+                while buff:
+                    yield buff
+                    buff = stream.read()
+            finally:
+                stream.close()
+
+        return WbResponse(statusline, headersList = headers, value = streamGen())
+
+    @staticmethod
+    def better_timestamp_response(wbrequest, newTimestamp):
+        wbrequest.wb_url.timestamp = newTimestamp
+        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
+        return WbResponse.redir_response(newUrl)
+
     def get_header(self, name):
-        name_upp = name.upper()
-        for value in self.headersList:
-            if (value[0].upper() == name_upp):
-                return value[1]
+        return utils.get_header(self.headersList, name)
 
     def __call__(self, env, start_response):
         #headersList = []
@@ -119,6 +136,12 @@ class WbResponse:
 
         start_response(self.status, self.headersList)
 
+        if env['REQUEST_METHOD'] == 'HEAD':
+            if hasattr(self.body, 'close'):
+                self.body.close()
+                return self.body
+            return []
+
         if hasattr(self.body, '__iter__'):
             return self.body
         else:
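stream_response wraps any file-like stream in a generator that yields chunks and closes the stream when exhausted; a sketch, assuming the value kwarg is stored as .body (as __call__ suggests) and using StringIO as a stand-in for an archive record stream:

# Sketch: StringIO stands in for a payload record stream
import StringIO

resp = WbResponse.stream_response('200 OK',
                                  [('Content-Type', 'text/html')],
                                  StringIO.StringIO('<html>payload</html>'))

print ''.join(resp.body)   # '<html>payload</html>'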
pywb/wburlrewriter.py
@@ -37,6 +37,9 @@ class ArchivalUrlRewriter:
 
     >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
     '/abc/19960708im_/'
+
+    >>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
+    True
     """
 
     NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
@@ -85,6 +88,14 @@ class ArchivalUrlRewriter:
     def setBaseUrl(self, newUrl):
         self.wburl.url = newUrl
 
+    @staticmethod
+    def stripProtocol(url):
+        for protocol in ArchivalUrlRewriter.PROTOCOLS:
+            if url.startswith(protocol):
+                return url[len(protocol):]
+
+        return url
+
 if __name__ == "__main__":
     import doctest