mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
archiveloader: support for loading WARC/ARC records using the hanzo parser (for record header parsing only)
ReplayHandler: load replay from the query response and pick the best matching capture; basic support for matching the url and checking for self-redirects!
This commit is contained in:
  parent 787dfc136e
  commit 16f458d5ec

pywb/archiveloader.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import hanzo.warctools

import re
import utils
import zlib
import urllib2
import StringIO
import urlparse
import collections
import wbexceptions


#=================================================================
# Loads a byte range of a remote file over HTTP, optionally sending
# an HMAC cookie for access control
class HttpStreamLoader:
    def __init__(self, hmac = None, hmacDuration = 30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

    def load(self, url, offset, length):
        if length:
            rangeHeader = 'bytes={0}-{1}'.format(offset, int(offset) + int(length) - 1)
        else:
            rangeHeader = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = rangeHeader

        if self.hmac:
            headers['Cookie'] = self.hmac(self.hmacDuration)

        request = urllib2.Request(url, headers = headers)
        return urllib2.urlopen(request)


#=================================================================
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders')


#=================================================================
class ArchiveLoader:
    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz':  (hanzo.warctools.ArcRecord,  'arc',  True),
        '.warc':    (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc':     (hanzo.warctools.ArcRecord,  'arc',  False),
    }

    HTTP_STATUS_REGEX = re.compile(r'^HTTP/[\d.]+ ((\d+).*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        return {
            'http': http,
            'https': http,
        }

    def __init__(self, loaders = None, chunkSize = 8192):
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

    def load(self, url, offset, length):
        urlParts = urlparse.urlsplit(url)

        loader = self.loaders.get(urlParts.scheme)
        if not loader:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        loaderCls = None

        for ext, (cls, fmt, gzip) in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                loaderCls = cls
                aFormat = fmt
                isGzip = gzip
                break

        if loaderCls is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        if isGzip:
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
        else:
            decomp = None

        raw = loader.load(url, offset, length)

        reader = LineReader(raw, self.chunkSize, decomp)

        parser = loaderCls.make_parser()

        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        if aFormat == 'arc':
            recType = 'arc-response'
            empty = (utils.get_header(parsed.headers, 'length') == '0')
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')

        parsed.recType = recType
        parsed.aFormat = aFormat

        # empty record (eg. a 0-length revisit); nothing further to parse
        if empty:
            return WBArchiveRecord(parsed, reader, '400', [])

        elif recType == 'metadata' or recType == 'resource':
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
            return WBArchiveRecord(parsed, reader, '200 OK', headers)

        else:
            (statusline, headers) = self.parseHttpHeaders(reader)
            return WBArchiveRecord(parsed, reader, statusline, headers)


    def parseHttpHeaders(self, stream):
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)

        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        #status = int(matched.group(2))
        statusline = matched.group(1)
        headers = []

        line = nextHeaderLine(stream)

        while line and line != '\r\n':
            name, value = line.split(':', 1)
            value = value.strip()
            headers.append((name, value))
            line = nextHeaderLine(stream)

        return (statusline, headers)


#=================================================================
# Buffered reader over a raw stream, with optional gzip decompression
class LineReader:
    def __init__(self, stream, chunkSize = 1024, decomp = None):
        self.stream = stream
        self.chunkSize = chunkSize
        self.decomp = decomp
        self.buff = None
        self.numread = 0

    def _fillbuff(self, chunkSize = None):
        if not chunkSize:
            chunkSize = self.chunkSize

        # refill only when the current buffer is exhausted
        if not self.buff or self.buff.pos >= self.buff.len:
            data = self.stream.read(chunkSize)
            self.numread += len(data)
            if self.decomp:
                data = self.decomp.decompress(data)

            self.buff = StringIO.StringIO(data)

    def read(self):
        self._fillbuff()
        return self.buff.read()

    def readline(self):
        self._fillbuff()
        return self.buff.readline()

    def close(self):
        if self.stream:
            self.stream.close()
            self.stream = None
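A minimal usage sketch of the new loader (the WARC URL, offset and length below are made-up stand-ins for values a real CDX entry would supply):

# Sketch only: url/offset/length are hypothetical
loader = ArchiveLoader()
record = loader.load('http://archive.example.org/col/example.warc.gz',
                     '247256770', '523')

print record.statusline      # e.g. '200 OK'
print record.httpHeaders     # list of (name, value) tuples
print record.stream.read()   # next buffered chunk of the payload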
pywb/indexreader.py
@@ -1,13 +1,14 @@
 import urllib
 import urllib2
 import wbexceptions
+import itertools
 
 from wbarchivalurl import ArchivalUrl
 
 class RemoteCDXServer:
     """
     >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
-    >>> pprint(vars(x[0]))
+    >>> pprint(x[0])
     {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
      'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
      'length': '1792',
@@ -20,7 +21,23 @@ class RemoteCDXServer:
      'timestamp': '20020120142510',
      'urlkey': 'com,example)/'}
 
-    """
+    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
+    >>> pprint(x[0])
+    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
+     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
+     'length': '523',
+     'mimetype': 'warc/revisit',
+     'offset': '247256770',
+     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
+     'orig.length': '529',
+     'orig.offset': '769759',
+     'original': 'http://www.example.com/',
+     'redirect': '-',
+     'robotflags': '-',
+     'statuscode': '-',
+     'timestamp': '20131210052355',
+     'urlkey': 'com,example)/'}
+    """
 
     def __init__(self, serverUrl):
         self.serverUrl = serverUrl
@@ -69,9 +86,22 @@ class RemoteCDXServer:
         }[wburl.type]
 
 
-class CDXCaptureResult:
-    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
+class CDXCaptureResult(dict):
+    CDX_FORMATS = [
+        # CDX 11 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+
+        # CDX 9 Format
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
+
+        # CDX 11 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
+         "orig.length","orig.offset","orig.filename"],
+
+        # CDX 9 Format + 3 revisit resolve fields
+        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
+         "orig.length","orig.offset","orig.filename"]
+    ]
 
     def __init__(self, cdxline):
         cdxline = cdxline.rstrip()
@@ -83,13 +113,14 @@ class CDXCaptureResult:
                 cdxformat = i
 
         if not cdxformat:
-            raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
+            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
 
-        for header, field in zip(cdxformat, fields):
-            setattr(self, header, field)
+        for header, field in itertools.izip(cdxformat, fields):
+            self[header] = field
+            # setattr(self, header, field)
 
-    def __repr__(self):
-        return str(vars(self))
+    #def __repr__(self):
+    #    return str(vars(self))
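Since CDXCaptureResult is now a dict keyed by the matched format's field names, a single line can be parsed like this (a sketch; the 9-field CDX line below is fabricated):

# Sketch: parse a fabricated 9-field CDX line
line = 'com,example)/ 20020120142510 http://example.com/ text/html 200 HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA - 59 DJ_crawl3.20020120141301.arc.gz'
cdx = CDXCaptureResult(line)
print cdx['timestamp']   # '20020120142510'
print cdx['filename']    # 'DJ_crawl3.20020120141301.arc.gz'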
pywb/query.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions


# Queries the cdx server for the requested url and streams back the raw
# cdx lines, or raises NotFoundException if there are none
class QueryHandler:
    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest, prev_wbresponse):
        wburl = wbrequest.wb_url

        params = self.cdxserver.getQueryParams(wburl)

        cdxlines = self.cdxserver.load(wburl.url, params)

        # peek to distinguish an empty result from a non-empty one
        cdxlines = utils.peek_iter(cdxlines)

        if cdxlines is not None:
            return wbrequestresponse.WbResponse.text_stream(cdxlines)

        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
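The handler leans on utils.peek_iter (extended below) to tell an empty CDX result from a non-empty one without consuming it; a sketch of that contract:

# peek_iter returns None for an exhausted iterator, otherwise an
# equivalent iterator with the first item pushed back
import utils

assert utils.peek_iter(iter([])) is None
lines = utils.peek_iter(iter(['com,example)/ 20020120142510 cdx-line\n']))
assert list(lines) == ['com,example)/ 20020120142510 cdx-line\n']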
pywb/replay.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import indexreader
import wbexceptions
from wbrequestresponse import WbResponse
import utils


class ReplayHandler:
    def __init__(self, resolvers, archiveloader):
        self.resolvers = resolvers
        self.archiveloader = archiveloader

    def __call__(self, wbrequest, query_response):
        cdxlist = query_response.body
        last_e = None
        first = True
        for cdx in cdxlist:
            try:
                cdx = indexreader.CDXCaptureResult(cdx)

                # First time through, check if do redirect before warc load
                if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                    return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                response = self.doReplay(cdx, wbrequest)

                if response:
                    # if a fallback, redirect to exact timestamp!
                    if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                        response.close()
                        return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                    return response

                first = False

            except Exception, e:
                import traceback
                traceback.print_exc()
                last_e = e
                pass

        if last_e:
            raise last_e

    def _load(self, cdx, revisit = False):
        prefix = '' if not revisit else 'orig.'
        return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']), cdx[prefix + 'offset'], cdx[prefix + 'length'])

    def doReplay(self, cdx, wbrequest):
        hasCurr = (cdx['filename'] != '-')
        hasOrig = (cdx['orig.filename'] != '-')

        # Case 1: non-revisit
        if (hasCurr and not hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = headersRecord
            isRevisit = False

        # Case 2: old-style revisit, load headers from original payload
        elif (not hasCurr and hasOrig):
            payloadRecord = self._load(cdx, False)
            headersRecord = payloadRecord
            isRevisit = True

        # Case 3: modern revisit, load headers from curr, payload from original
        elif (hasCurr and hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = self._load(cdx, True)

            # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
            if not headersRecord.httpHeaders:
                headersRecord.close()
                headersRecord = payloadRecord

            isRevisit = True

        else:
            raise wbexceptions.CaptureException('Invalid CDX: ' + str(cdx))

        # Check for self redirect
        if headersRecord.statusline.startswith('3'):
            if self.isSelfRedirect(wbrequest, headersRecord):
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)

    def isSelfRedirect(self, wbrequest, record):
        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = utils.get_header(record.httpHeaders, 'Location')
        if not locationUrl:
            return False
        return requestUrl == locationUrl.lower()
        #ArchivalUrlRewriter.stripProtocol(requestUrl) == ArchivalUrlRewriter.stripProtocol(locationUrl)


    def resolveFull(self, filename):
        # Attempt to resolve cdx file to full path
        fullUrl = None
        for resolver in self.resolvers:
            fullUrl = resolver(filename)
            if fullUrl:
                return fullUrl

        raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)


#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
    def makeUrl(url):
        return prefix + url if (contains in url) else None

    return makeUrl
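testwb.createReplay(), referenced by the test app below, is not part of this diff; a hypothetical equivalent wiring of the pieces above might look like:

# Hypothetical wiring; testwb.createReplay itself is not shown in this commit,
# and the prefix url is made up
import archiveloader
import replay

def createReplay():
    # map CDX 'filename' entries containing '.warc' to a full fetchable url
    resolver = replay.PrefixResolver('http://archive.example.org/files/', '.warc')
    return replay.ReplayHandler(resolvers = [resolver],
                                archiveloader = archiveloader.ArchiveLoader())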
pywb/utils.py
@@ -1,4 +1,6 @@
 import itertools
+import hmac
+import time
 
 def peek_iter(iterable):
     try:
@@ -7,3 +9,39 @@ def peek_iter(iterable):
         return None
 
     return itertools.chain([first], iterable)
+
+
+def get_header(headersList, name):
+    nameLower = name.lower()
+    for value in headersList:
+        if (value[0].lower() == nameLower):
+            return value[1]
+
+
+class HMACCookieMaker:
+    def __init__(self, key, name):
+        self.key = key
+        self.name = name
+
+
+    def __call__(self, duration, extraId = ''):
+        expire = str(long(time.time() + duration))
+
+        if extraId:
+            msg = extraId + '-' + expire
+        else:
+            msg = expire
+
+        hmacdigest = hmac.new(self.key, msg)
+        hexdigest = hmacdigest.hexdigest()
+
+        if extraId:
+            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
+        else:
+            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
+
+        return cookie
+
+        #return cookie + hexdigest
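A sketch of how HMACCookieMaker plugs into HttpStreamLoader from archiveloader.py (the key and cookie name below are made up):

# Sketch: key and cookie name are illustrative
import utils
import archiveloader

cookieMaker = utils.HMACCookieMaker('secret-key', 'wbhmac')
print cookieMaker(30)    # e.g. 'wbhmac=1389000030-<hexdigest>'

# HttpStreamLoader calls cookieMaker(hmacDuration) and sends the result
# as a Cookie header on each range request
loader = archiveloader.HttpStreamLoader(hmac = cookieMaker, hmacDuration = 30)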
pywb/wbapp.py
@@ -1,11 +1,11 @@
-import indexreader
+from query import QueryHandler
-import json
 import wbexceptions
-import utils
 
 from wbrequestresponse import WbResponse
 from archivalrouter import ArchivalRequestRouter
 
+
+## ===========
 class EchoEnv:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest.env))
@@ -14,33 +14,20 @@ class WBHandler:
     def __call__(self, wbrequest, _):
         return WbResponse.text_response(str(wbrequest))
 
-class QueryHandler:
-    def __init__(self):
-        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
-
-    def __call__(self, wbrequest, prev_wbresponse):
-        wburl = wbrequest.wb_url
-
-        params = self.cdxserver.getQueryParams(wburl)
-
-        cdxlines = self.cdxserver.load(wburl.url, params)
-
-        cdxlines = utils.peek_iter(cdxlines)
-
-        if cdxlines is not None:
-            return WbResponse.text_stream(cdxlines)
-
-        raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
-
+## ===========
+query = QueryHandler()
+
+import testwb
+replay = testwb.createReplay()
+
 ## ===========
 parser = ArchivalRequestRouter(
     {
         't0' : [EchoEnv()],
         't1' : [WBHandler()],
-        't2' : [QueryHandler()]
+        't2' : [query],
+        't3' : [query, replay],
     },
     hostpaths = ['http://localhost:9090/'])
 ## ===========
@@ -63,6 +50,7 @@ def application(env, start_response):
 
     return response(env, start_response)
 
+
 def handleException(env, exc):
     if hasattr(exc, 'status'):
         status = exc.status()
pywb/wbexceptions.py
@@ -18,3 +18,24 @@ class InvalidCDXException(Exception):
 class NotFoundException(Exception):
     def status(_):
         return '404'
+
+
+# Exceptions that affect a specific capture and result in a retry
+class CaptureException(Exception):
+    def status(_):
+        return '500'
+
+class UnresolvedArchiveFileException(CaptureException):
+    pass
+
+class UnknownArchiveFormatException(CaptureException):
+    pass
+
+class UnknownLoaderProtocolException(CaptureException):
+    pass
+
+class InvalidArchiveRecordException(CaptureException):
+    def __init__(self, msg, errList = None):
+        super(InvalidArchiveRecordException, self).__init__(msg)
+        self.errList = errList
pywb/wbrequestresponse.py
@@ -1,4 +1,5 @@
 from wbarchivalurl import ArchivalUrl
+import utils
 #WB Request and Response
 
 class WbRequest:
@@ -106,11 +107,27 @@ class WbResponse:
     def redir_response(location, status = '302 Redirect'):
         return WbResponse(status, headersList = [('Location', location)])
 
+    @staticmethod
+    def stream_response(statusline, headers, stream):
+        def streamGen():
+            try:
+                buff = stream.read()
+                while buff:
+                    yield buff
+                    buff = stream.read()
+            finally:
+                stream.close()
+
+        return WbResponse(statusline, headersList = headers, value = streamGen())
+
+    @staticmethod
+    def better_timestamp_response(wbrequest, newTimestamp):
+        wbrequest.wb_url.timestamp = newTimestamp
+        newUrl = wbrequest.wb_prefix + str(wbrequest.wb_url)[1:]
+        return WbResponse.redir_response(newUrl)
+
     def get_header(self, name):
-        name_upp = name.upper()
-        for value in self.headersList:
-            if (value[0].upper() == name_upp):
-                return value[1]
+        return utils.get_header(self.headersList, name)
 
     def __call__(self, env, start_response):
         #headersList = []
@@ -119,6 +136,12 @@ class WbResponse:
 
         start_response(self.status, self.headersList)
 
+        if env['REQUEST_METHOD'] == 'HEAD':
+            if hasattr(self.body, 'close'):
+                self.body.close()
+                return self.body
+            return []
+
         if hasattr(self.body, '__iter__'):
             return self.body
         else:
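stream_response wraps any file-like stream in a generator that yields chunks and closes the stream when exhausted; a sketch, assuming the value kwarg is stored as .body (as __call__ suggests) and using StringIO as a stand-in for an archive record stream:

# Sketch: StringIO stands in for a payload record stream
import StringIO

resp = WbResponse.stream_response('200 OK',
                                  [('Content-Type', 'text/html')],
                                  StringIO.StringIO('<html>payload</html>'))

print ''.join(resp.body)   # '<html>payload</html>'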
pywb/wburlrewriter.py
@@ -37,6 +37,9 @@ class ArchivalUrlRewriter:
 
     >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
     '/abc/19960708im_/'
+
+    >>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
+    True
     """
 
     NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
@@ -85,6 +88,14 @@ class ArchivalUrlRewriter:
     def setBaseUrl(self, newUrl):
         self.wburl.url = newUrl
 
+    @staticmethod
+    def stripProtocol(url):
+        for protocol in ArchivalUrlRewriter.PROTOCOLS:
+            if url.startswith(protocol):
+                return url[len(protocol):]
+
+        return url
+
 if __name__ == "__main__":
     import doctest