1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

archiveloader: Support for loading warc/arc records using hanzo parser (for record header parsing only)

ReplayHandler: load replay from query response, find best option
basic support for matching url, checking self-redirects!
This commit is contained in:
Ilya Kreymer 2013-12-28 05:00:06 -08:00
parent 787dfc136e
commit 16f458d5ec
9 changed files with 471 additions and 36 deletions

189
pywb/archiveloader.py Normal file
View File

@ -0,0 +1,189 @@
import hanzo.warctools
import re
import utils
import zlib
import urllib2
import StringIO
import urlparse
import collections
#=================================================================
class HttpStreamLoader:
    """Fetch a byte range of a remote archive file over HTTP.

    If an ``hmac`` callable is supplied, it is invoked with
    ``hmacDuration`` to produce a Cookie header value for signed
    range requests.
    """

    def __init__(self, hmac = None, hmacDuration = 30):
        self.hmac = hmac
        self.hmacDuration = hmacDuration

    def load(self, url, offset, length):
        """Open *url* and return the urllib2 response for the requested range."""
        if length:
            lastByte = int(offset) + int(length) - 1
            headers = {'Range': 'bytes={0}-{1}'.format(offset, lastByte)}
        else:
            # open-ended range: read from offset to end of file
            headers = {'Range': 'bytes={0}-'.format(offset)}

        if self.hmac:
            headers['Cookie'] = self.hmac(self.hmacDuration)

        return urllib2.urlopen(urllib2.Request(url, headers = headers))
#=================================================================
# A fully-loaded archive record: the hanzo-parsed record, the content
# stream positioned after the record headers, the http status line,
# and the parsed http headers as a list of (name, value) tuples.
WBArchiveRecord = collections.namedtuple('WBArchiveRecord', 'parsed, stream, statusline, httpHeaders')
#=================================================================
class ArchiveLoader:
    """Load and parse a single warc/arc record at a given (url, offset, length).

    Uses the hanzo warctools parsers for record header parsing only; the
    http status line and headers of response records are parsed here.

    NOTE(review): this module references ``wbexceptions`` but the visible
    imports do not include it -- add ``import wbexceptions`` at the top of
    the file, otherwise every error path raises NameError instead.
    """

    # Standard ARC headers
    ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"]

    # Since loading a range request, can only determine gzip-ness based on file extension
    FORMAT_MAP = {
        '.warc.gz': (hanzo.warctools.WarcRecord, 'warc', True),
        '.arc.gz':  (hanzo.warctools.ArcRecord,  'arc',  True),
        '.warc':    (hanzo.warctools.WarcRecord, 'warc', False),
        '.arc':     (hanzo.warctools.ArcRecord,  'arc',  False),
    }

    HTTP_STATUS_REGEX = re.compile(r'^HTTP/[\d.]+ ((\d+).*)$')

    @staticmethod
    def createDefaultLoaders():
        http = HttpStreamLoader()
        return {
            'http': http,
            'https': http,
        }

    def __init__(self, loaders = None, chunkSize = 8192):
        # None (or any falsy value) means "use the default http/https loaders";
        # default changed from a mutable {} literal, same observable behavior
        self.loaders = loaders if loaders else ArchiveLoader.createDefaultLoaders()
        self.chunkSize = chunkSize

    def load(self, url, offset, length):
        """Load one record and return a WBArchiveRecord.

        Raises UnknownLoaderProtocolException for an unsupported scheme,
        UnknownArchiveFormatException for an unrecognized file extension,
        and InvalidArchiveRecordException on parse errors.
        """
        urlParts = urlparse.urlsplit(url)

        # BUGFIX: dict.get never raises, so the original try/except could
        # never raise UnknownLoaderProtocolException; check for None instead
        loader = self.loaders.get(urlParts.scheme)
        if loader is None:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        # BUGFIX: the original loop bound its tuple members to the same
        # names as the sentinel, so the "no match" check was unreachable
        # and isGzip could be left unbound
        matched = None
        for ext, formatInfo in ArchiveLoader.FORMAT_MAP.iteritems():
            if url.endswith(ext):
                matched = formatInfo
                break

        if matched is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        (loaderCls, aFormat, isGzip) = matched

        # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
        decomp = zlib.decompressobj(16 + zlib.MAX_WBITS) if isGzip else None

        raw = loader.load(url, offset, length)
        reader = LineReader(raw, self.chunkSize, decomp)

        parser = loaderCls.make_parser()
        if aFormat == 'arc':
            parser.headers = ArchiveLoader.ARC_HEADERS

        (parsed, errors, _) = parser.parse(reader, 0)

        if errors:
            reader.close()
            raise wbexceptions.InvalidArchiveRecordException('Error Parsing Record', errors)

        if aFormat == 'arc':
            recType = 'arc-response'
            # NOTE(review): header values are normally strings; comparing to
            # int 0 here (vs '0' below) looks suspicious -- confirm against
            # what the hanzo arc parser actually yields
            empty = (utils.get_header(parsed.headers, 'length') == 0)
        else:
            recType = utils.get_header(parsed.headers, 'WARC-Type')
            empty = (utils.get_header(parsed.headers, 'Content-Length') == '0')

        parsed.recType = recType
        parsed.aFormat = aFormat

        if empty:
            return WBArchiveRecord(parsed, reader, '400', [])
        elif recType in ('metadata', 'resource'):
            headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
            return WBArchiveRecord(parsed, reader, '200 OK', headers)
        else:
            (statusline, headers) = self.parseHttpHeaders(reader)
            return WBArchiveRecord(parsed, reader, statusline, headers)

    def parseHttpHeaders(self, stream):
        """Parse an http status line + header block from *stream*.

        Returns (statusline, [(name, value), ...]).
        """
        def nextHeaderLine(stream):
            return stream.readline().rstrip()

        line = nextHeaderLine(stream)
        matched = self.HTTP_STATUS_REGEX.match(line)
        if not matched:
            raise wbexceptions.InvalidArchiveRecordException('Expected HTTP Status Line, Found: ' + line)

        # group(1) is the full status text after 'HTTP/x.y '
        statusline = matched.group(1)

        headers = []
        line = nextHeaderLine(stream)
        while line and line != '\r\n':
            name, value = line.split(':', 1)
            headers.append((name, value.strip()))
            line = nextHeaderLine(stream)

        return (statusline, headers)
#=================================================================
class LineReader:
    """Buffered reader over a raw stream with optional zlib decompression.

    Pulls fixed-size chunks from the underlying stream and serves
    read()/readline() calls out of an in-memory StringIO buffer;
    numread counts raw (compressed) bytes consumed.
    """

    def __init__(self, stream, chunkSize = 1024, decomp = None):
        self.stream = stream
        self.chunkSize = chunkSize
        self.decomp = decomp
        self.buff = None
        self.numread = 0

    def _fillbuff(self, chunkSize = None):
        # still serving from the current buffer? nothing to do
        if self.buff and self.buff.pos < self.buff.len:
            return

        size = chunkSize if chunkSize else self.chunkSize

        data = self.stream.read(size)
        self.numread += len(data)
        if self.decomp:
            data = self.decomp.decompress(data)

        self.buff = StringIO.StringIO(data)

    def read(self):
        # returns the remainder of the current buffered chunk
        self._fillbuff()
        return self.buff.read()

    def readline(self):
        self._fillbuff()
        return self.buff.readline()

    def close(self):
        # safe to call more than once
        if self.stream:
            self.stream.close()
            self.stream = None

View File

@ -1,13 +1,14 @@
import urllib
import urllib2
import wbexceptions
import itertools
from wbarchivalurl import ArchivalUrl
class RemoteCDXServer:
"""
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
>>> pprint(vars(x[0]))
>>> pprint(x[0])
{'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
'length': '1792',
@ -20,7 +21,23 @@ class RemoteCDXServer:
'timestamp': '20020120142510',
'urlkey': 'com,example)/'}
"""
>>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
>>> pprint(x[0])
{'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
'length': '523',
'mimetype': 'warc/revisit',
'offset': '247256770',
'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
'orig.length': '529',
'orig.offset': '769759',
'original': 'http://www.example.com/',
'redirect': '-',
'robotflags': '-',
'statuscode': '-',
'timestamp': '20131210052355',
'urlkey': 'com,example)/'}
"""
def __init__(self, serverUrl):
self.serverUrl = serverUrl
@ -69,9 +86,22 @@ class RemoteCDXServer:
}[wburl.type]
class CDXCaptureResult:
CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
class CDXCaptureResult(dict):
CDX_FORMATS = [
# CDX 11 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
# CDX 9 Format
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],
# CDX 11 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
"orig.length","orig.offset","orig.filename"],
# CDX 9 Format + 3 revisit resolve fields
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
"orig.length","orig.offset","orig.filename"]
]
def __init__(self, cdxline):
cdxline = cdxline.rstrip()
@ -83,13 +113,14 @@ class CDXCaptureResult:
cdxformat = i
if not cdxformat:
raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in zip(cdxformat, fields):
setattr(self, header, field)
for header, field in itertools.izip(cdxformat, fields):
self[header] = field
# setattr(self, header, field)
def __repr__(self):
return str(vars(self))
#def __repr__(self):
# return str(vars(self))

24
pywb/query.py Normal file
View File

@ -0,0 +1,24 @@
import indexreader
import utils
import wbrequestresponse
import wbexceptions
class QueryHandler:
    """Query the remote CDX server for captures of the requested url."""

    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    def __call__(self, wbrequest, prev_wbresponse):
        wburl = wbrequest.wb_url

        params = self.cdxserver.getQueryParams(wburl)
        # peek to distinguish "no captures" (None) from a non-empty iterator
        lines = utils.peek_iter(self.cdxserver.load(wburl.url, params))

        if lines is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

        return wbrequestresponse.WbResponse.text_stream(lines)

110
pywb/replay.py Normal file
View File

@ -0,0 +1,110 @@
import indexreader
from wbrequestresponse import WbResponse
import utils
class ReplayHandler:
    """Replay a capture chosen from a cdx query response.

    Walks the cdx lines in order, attempting to load/replay each capture;
    a CaptureException (or any error) on one line falls through to the
    next, and the last error is re-raised if nothing succeeds.

    NOTE(review): this module uses ``wbexceptions`` but the visible imports
    do not include it -- add ``import wbexceptions`` at the top of the file.
    """

    def __init__(self, resolvers, archiveloader):
        self.resolvers = resolvers
        self.archiveloader = archiveloader

    def __call__(self, wbrequest, query_response):
        cdxlist = query_response.body
        last_e = None
        first = True

        for cdx in cdxlist:
            try:
                cdx = indexreader.CDXCaptureResult(cdx)

                # First time through, check if do redirect before warc load
                if first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                    return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])

                response = self.doReplay(cdx, wbrequest)

                if response:
                    # if a fallback, redirect to exact timestamp!
                    if not first and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
                        response.close()
                        return WbResponse.better_timestamp_response(wbrequest, cdx['timestamp'])
                    return response

                first = False

            # BUGFIX: 'except Exception, e' is py2-only; 'as' works on 2.6+ and 3.x
            except Exception as e:
                import traceback
                traceback.print_exc()
                last_e = e

        if last_e:
            raise last_e

    def _load(self, cdx, revisit = False):
        # revisit records locate the payload via the 'orig.*' cdx fields
        prefix = 'orig.' if revisit else ''
        return self.archiveloader.load(self.resolveFull(cdx[prefix + 'filename']),
                                       cdx[prefix + 'offset'],
                                       cdx[prefix + 'length'])

    def doReplay(self, cdx, wbrequest):
        hasCurr = (cdx['filename'] != '-')
        hasOrig = (cdx['orig.filename'] != '-')

        # Case 1: non-revisit
        if (hasCurr and not hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = headersRecord
            isRevisit = False

        # Case 2: old-style revisit, load headers from original payload
        elif (not hasCurr and hasOrig):
            payloadRecord = self._load(cdx, False)
            headersRecord = payloadRecord
            isRevisit = True

        # Case 3: modern revisit, load headers from curr, payload from original
        elif (hasCurr and hasOrig):
            headersRecord = self._load(cdx, False)
            payloadRecord = self._load(cdx, True)

            # Case 4: if headers record is actually empty (eg empty revisit), then use headers from revisit
            if not headersRecord.httpHeaders:
                headersRecord.close()
                headersRecord = payloadRecord

            isRevisit = True

        else:
            # BUGFIX: 'Invalid CDX' + cdx raised TypeError (cdx is a dict subclass)
            raise wbexceptions.CaptureException('Invalid CDX: ' + str(cdx))

        # Check for self redirect
        if headersRecord.statusline.startswith('3'):
            if self.isSelfRedirect(wbrequest, headersRecord):
                # BUGFIX: was 'wbexception.CaptureException' (NameError typo)
                # and str-concatenated the cdx dict (TypeError)
                raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

        return WbResponse.stream_response(headersRecord.statusline, headersRecord.httpHeaders, payloadRecord.stream)

    def isSelfRedirect(self, wbrequest, record):
        # NOTE(review): get_header returns None when Location is absent,
        # which would raise AttributeError on .lower() -- only called for
        # 3xx records, which presumably always carry Location; verify
        requestUrl = wbrequest.wb_url.url.lower()
        locationUrl = utils.get_header(record.httpHeaders, 'Location').lower()

        return requestUrl == locationUrl

    def resolveFull(self, filename):
        # Attempt to resolve cdx file to full path
        for resolver in self.resolvers:
            fullUrl = resolver(filename)
            if fullUrl:
                return fullUrl

        # BUGFIX: was 'exceptions.UnresolvedArchiveFileException' (wrong
        # module) and referenced out-of-scope 'cdx.filename'
        raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
def PrefixResolver(prefix, contains):
    """Build a resolver callable for cdx filename -> full url.

    The returned callable yields ``prefix + url`` when *contains* is a
    substring of the url, otherwise None (letting the next resolver try).
    """
    def makeUrl(url):
        if contains in url:
            return prefix + url
        return None

    return makeUrl

View File

@ -1,4 +1,6 @@
import itertools
import hmac
import time
def peek_iter(iterable):
try:
@ -7,3 +9,39 @@ def peek_iter(iterable):
return None
return itertools.chain([first], iterable)
def get_header(headersList, name):
    """Case-insensitive lookup in a list of (name, value) header tuples.

    Returns the first matching value, or None if the header is absent.
    """
    target = name.lower()

    for (key, value) in headersList:
        if key.lower() == target:
            return value

    return None
class HMACCookieMaker:
    """Generate an expiring hmac-signed cookie value.

    The cookie embeds an expiry unix timestamp plus an hmac hexdigest over
    it (and an optional extra id), producing either
    ``name-extraId=expire-digest`` or ``name=expire-digest``.
    """

    def __init__(self, key, name):
        # key: secret hmac key (bytes); name: cookie name
        self.key = key
        self.name = name

    def __call__(self, duration, extraId = ''):
        import hashlib

        # BUGFIX: was long(...), which does not exist on py3; int() is
        # equivalent here on py2 as well
        expire = str(int(time.time() + duration))

        if extraId:
            msg = extraId + '-' + expire
        else:
            msg = expire

        # BUGFIX: hmac.new without digestmod relied on the py2-only md5
        # default and is an error on py3; md5 is kept explicitly so the
        # produced digests are unchanged.  msg is encoded so this also
        # runs on py3 (a no-op for py2 ascii str).
        hmacdigest = hmac.new(self.key, msg.encode('utf-8'), hashlib.md5)
        hexdigest = hmacdigest.hexdigest()

        if extraId:
            cookie = '{0}-{1}={2}-{3}'.format(self.name, extraId, expire, hexdigest)
        else:
            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return cookie

View File

@ -1,11 +1,11 @@
import indexreader
import json
from query import QueryHandler
import wbexceptions
import utils
from wbrequestresponse import WbResponse
from archivalrouter import ArchivalRequestRouter
## ===========
class EchoEnv:
    """Debug handler: echo the raw WSGI environ back as a text response."""
    def __call__(self, wbrequest, _):
        return WbResponse.text_response(str(wbrequest.env))
@ -14,33 +14,20 @@ class WBHandler:
def __call__(self, wbrequest, _):
return WbResponse.text_response(str(wbrequest))
class QueryHandler:
def __init__(self):
self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
def __call__(self, wbrequest, prev_wbresponse):
wburl = wbrequest.wb_url
params = self.cdxserver.getQueryParams(wburl)
cdxlines = self.cdxserver.load(wburl.url, params)
cdxlines = utils.peek_iter(cdxlines)
if cdxlines is not None:
return WbResponse.text_stream(cdxlines)
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
## ===========
query = QueryHandler()
import testwb
replay = testwb.createReplay()
## ===========
parser = ArchivalRequestRouter(
{
't0' : [EchoEnv()],
't1' : [WBHandler()],
't2' : [QueryHandler()]
't2' : [query],
't3' : [query, replay],
},
hostpaths = ['http://localhost:9090/'])
## ===========
@ -63,6 +50,7 @@ def application(env, start_response):
return response(env, start_response)
def handleException(env, exc):
if hasattr(exc, 'status'):
status = exc.status()

View File

@ -18,3 +18,24 @@ class InvalidCDXException(Exception):
class NotFoundException(Exception):
def status(_):
return '404'
# Exceptions that effect a specific capture and result in a retry
class CaptureException(Exception):
    """Failure replaying one capture; the caller may retry the next cdx line."""
    def status(_):
        return '500'

class UnresolvedArchiveFileException(CaptureException):
    # cdx filename could not be resolved to a full archive path/url
    pass

class UnknownArchiveFormatException(CaptureException):
    # archive file extension not in the supported warc/arc format map
    pass

class UnknownLoaderProtocolException(CaptureException):
    # archive url scheme has no registered loader
    pass

class InvalidArchiveRecordException(CaptureException):
    # BUGFIX: __init__ was declared without 'self', so constructing this
    # exception with a message raised TypeError instead
    def __init__(self, msg, errList = None):
        super(InvalidArchiveRecordException, self).__init__(msg)
        self.errList = errList

View File

@ -1,4 +1,5 @@
from wbarchivalurl import ArchivalUrl
import utils
#WB Request and Response
class WbRequest:
@ -106,11 +107,27 @@ class WbResponse:
def redir_response(location, status = '302 Redirect'):
return WbResponse(status, headersList = [('Location', location)])
@staticmethod
def stream_response(statusline, headers, stream):
    """Build a WbResponse whose body streams *stream* chunk-by-chunk.

    The stream is always closed when the generator is exhausted or
    abandoned, even on error.
    """
    def streamGen():
        try:
            while True:
                chunk = stream.read()
                if not chunk:
                    break
                yield chunk
        finally:
            # release the underlying archive record stream
            stream.close()

    return WbResponse(statusline, headersList = headers, value = streamGen())
@staticmethod
def better_timestamp_response(wbrequest, newTimestamp):
    """Redirect the client to the same request at *newTimestamp*.

    Mutates wbrequest.wb_url in place, then joins the prefix with the
    archival url minus its leading '/'.
    """
    wbrequest.wb_url.timestamp = newTimestamp
    return WbResponse.redir_response(wbrequest.wb_prefix + str(wbrequest.wb_url)[1:])
def get_header(self, name):
name_upp = name.upper()
for value in self.headersList:
if (value[0].upper() == name_upp):
return value[1]
return utils.get_header(self.headersList, name)
def __call__(self, env, start_response):
#headersList = []
@ -119,6 +136,12 @@ class WbResponse:
start_response(self.status, self.headersList)
if env['REQUEST_METHOD'] == 'HEAD':
if hasattr(self.body, 'close'):
self.body.close()
return self.body
return []
if hasattr(self.body, '__iter__'):
return self.body
else:

View File

@ -37,6 +37,9 @@ class ArchivalUrlRewriter:
>>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
'/abc/19960708im_/'
>>> ArchivalUrlRewriter.stripProtocol('https://example.com') == ArchivalUrlRewriter.stripProtocol('http://example.com')
True
"""
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
@ -85,6 +88,14 @@ class ArchivalUrlRewriter:
def setBaseUrl(self, newUrl):
self.wburl.url = newUrl
@staticmethod
def stripProtocol(url):
    """Return *url* with any known protocol prefix removed.

    Urls with no recognized protocol are returned unchanged.
    """
    for scheme in ArchivalUrlRewriter.PROTOCOLS:
        if url.startswith(scheme):
            return url[len(scheme):]

    return url
if __name__ == "__main__":
import doctest