1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/indexreader.py
Ilya Kreymer c55bdf0e1f -binsearch: add tests, support both prefix and exact loading, for #11
-cdx server first pass for #12: implement cdx parsing and transforming
-operations supported: merge sort, regex filter, resolve revisits, closest sort, reverse sort,
timestamp collapse
timestamp parsing utils
2014-01-27 17:02:48 -08:00

146 lines
4.8 KiB
Python

import urllib
import urllib2
import wbexceptions
import itertools
from collections import OrderedDict
from wbarchivalurl import ArchivalUrl
class RemoteCDXServer:
    """
    Client for a remote CDX server reachable over HTTP.

    The query is url-encoded and passed as the request data to ``serverUrl``
    (so urllib2 issues a POST). If a cookie string was supplied, it is sent
    in the ``Cookie`` header of every request.

    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(x[0].items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20020120142510'),
     ('original', 'http://example.com:80/'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
     ('length', '1792')]
    """

    def __init__(self, serverUrl, cookie = None):
        """
        serverUrl -- url of the cdx server endpoint to query
        cookie    -- optional value for the 'Cookie' request header
        """
        self.serverUrl = serverUrl
        self.authCookie = cookie

    @staticmethod
    def _mergeParams(url, params, kwvalues):
        """
        Build the final query dict for a request.

        Returns a NEW dict made of ``params`` (if any), then 'url', then
        ``kwvalues``; later entries win. Neither input mapping is modified,
        so nothing leaks between calls or back into the caller's dict.
        """
        merged = dict(params) if params else {}
        # url is required, must be passed explicitly!
        merged['url'] = url
        merged.update(kwvalues)
        return merged

    def load(self, url, params = None, parse_cdx = False, **kwvalues):
        """
        Query the cdx server for ``url``.

        url       -- the url to look up (always sent as the 'url' param)
        params    -- optional mapping of extra query params (not modified)
        parse_cdx -- if true, return a list of CDXCaptureResult, one per
                     response line, and close the response; otherwise
                     return the raw urllib2 response object, which the
                     caller is then responsible for closing
        kwvalues  -- additional query params, overriding ``params``

        Raises wbexceptions.AccessException if the server answers 403
        ('Blocked By Robots' if the body says so, otherwise 'Excluded').
        Any other urllib2.HTTPError is re-raised unchanged.
        """
        urlparams = urllib.urlencode(self._mergeParams(url, params, kwvalues), True)

        try:
            request = urllib2.Request(self.serverUrl, urlparams)

            if self.authCookie:
                request.add_header('Cookie', self.authCookie)

            response = urllib2.urlopen(request)

        except urllib2.HTTPError as e:
            if e.code == 403:
                exc_msg = e.read()
                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
                raise wbexceptions.AccessException(msg)

            # bare raise keeps the original traceback intact
            raise

        if not parse_cdx:
            return response

        try:
            return [CDXCaptureResult(line) for line in response]
        finally:
            # every line has been consumed (or parsing failed): release the connection
            response.close()

    # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
    # with lower values if there are too many captures. Ideally, should be around 10-20
    # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
    @staticmethod
    def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
        """
        Return the dict of cdx query params matching ``wburl.type``.

        wburl         -- object exposing the QUERY, URL_QUERY, REPLAY and
                         LATEST_REPLAY constants plus 'type' and 'timestamp'
        limit         -- 'limit' used for QUERY and URL_QUERY lookups
        collapseTime  -- 'collapseTime' used for QUERY lookups
        replayClosest -- 'limit' used for REPLAY lookups

        Raises KeyError if ``wburl.type`` is not one of the four constants.
        """
        return {
            wburl.QUERY: {
                'collapseTime': collapseTime,
                'filter': '!statuscode:(500|502|504)',
                'limit': limit,
            },

            wburl.URL_QUERY: {
                'collapse': 'urlkey',
                'matchType': 'prefix',
                'showGroupCount': True,
                'showUniqCount': True,
                'lastSkipTimestamp': True,
                'limit': limit,
                'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
            },

            wburl.REPLAY: {
                'sort': 'closest',
                'filter': '!statuscode:(500|502|504)',
                'limit': replayClosest,
                'closest': wburl.timestamp,
                'resolveRevisits': True,
            },

            # BUG: resolveRevisits currently doesn't work for this type of query
            # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
            # but may be an issue in proxy mode
            wburl.LATEST_REPLAY: {
                'sort': 'reverse',
                'filter': 'statuscode:[23]..',
                'limit': '1',
                'resolveRevisits': True,
            },
        }[wburl.type]
class CDXCaptureResult(OrderedDict):
    """
    One parsed cdx line, stored as an ordered mapping of field name -> value.

    The line is split on single spaces and the field names are taken from
    the entry of CDX_FORMATS that has the same number of fields. str() of
    the object returns the original line until a field is assigned, after
    which the line is rebuilt from the current values.
    """

    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
    ]

    def __init__(self, cdxline):
        """
        Parse ``cdxline`` (trailing whitespace is stripped first).

        Raises wbexceptions.InvalidCDXException if the number of
        space-separated fields matches none of CDX_FORMATS.
        """
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        # the format is identified purely by its field count
        cdxformat = None
        for i in CDXCaptureResult.CDX_FORMATS:
            if len(i) == len(fields):
                cdxformat = i

        if not cdxformat:
            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # must be set last: every assignment above reset it via __setitem__
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        """Store the field and drop the cached line so it no longer goes stale."""
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __str__(self):
        """Return the cached cdx line, or the current values joined by single spaces."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())
# Testing: run this module's doctests when executed directly, or whenever
# utils.enable_doctests() says so at import time.
import utils

if __name__ == "__main__" or utils.enable_doctests():
    import doctest
    from pprint import pprint

    # `pprint` and `cdxserver` are module globals that the doctests refer to
    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    doctest.testmod()