mirror of
synced 2025-03-25 23:47:47 +01:00
-cdx server first pass for #12: implement cdx parsing and transforming -operations supported: merge sort, regex filter, resolve revisits, closest sort, reverse sort, timestamp collapse timestamp parsing utils
146 lines
4.8 KiB
146 lines
4.8 KiB
import urllib
import urllib2
import wbexceptions
import itertools
from collections import OrderedDict
from wbarchivalurl import ArchivalUrl
class RemoteCDXServer:
    """
    Client that sends capture-index (CDX) queries to a remote CDX server
    over HTTP and optionally parses the returned lines.

    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(x[0].items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20020120142510'),
     ('original', 'http://example.com:80/'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('length', '1792')]
    """

    def __init__(self, serverUrl, cookie=None):
        """
        serverUrl -- full URL of the CDX query endpoint
        cookie    -- optional value sent as the 'Cookie' header on each query
        """
        self.serverUrl = serverUrl
        self.authCookie = cookie

    @staticmethod
    def _mergeParams(url, params, kwvalues):
        """Build the query dict for one request without touching the caller's dict."""
        merged = dict(params) if params else {}
        merged.update(kwvalues)
        # url is required, must be passed explicitly!
        merged['url'] = url
        return merged

    def load(self, url, params=None, parse_cdx=False, **kwvalues):
        """
        Query the CDX server for captures of url.

        url       -- the url to look up (always sent as the 'url' parameter)
        params    -- optional dict of extra query parameters; it is copied,
                     never modified
        parse_cdx -- if true, return a list of CDXCaptureResult, one per
                     response line; otherwise return the open response object
        kwvalues  -- additional query parameters given as keyword arguments
                     (e.g. limit = '2'); these override entries in params

        Raises wbexceptions.AccessException when the server answers 403;
        any other urllib2.HTTPError propagates unchanged.
        """
        query = self._mergeParams(url, params, kwvalues)

        # doseq=True so that list values expand into repeated parameters
        urlparams = urllib.urlencode(query, True)

        request = urllib2.Request(self.serverUrl, urlparams)

        if self.authCookie:
            request.add_header('Cookie', self.authCookie)

        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            if e.code == 403:
                exc_msg = e.read()
                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
                raise wbexceptions.AccessException(msg)
            # bare raise keeps the original traceback
            raise

        if not parse_cdx:
            return response

        try:
            return [CDXCaptureResult(line) for line in response]
        finally:
            # everything has been read into the list, so release the connection
            response.close()

    # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
    # with lower values if there are too many captures. Ideally, should be around 10-20
    # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
    @staticmethod
    def getQueryParams(wburl, limit='150000', collapseTime='10', replayClosest='4000'):
        """
        Return the dict of CDX server query parameters that fits wburl.

        wburl         -- parsed archival url; its timestamp is used as the
                         'closest' target for replay queries
        limit         -- max number of cdx lines for listing queries
        collapseTime  -- value for the 'collapseTime' parameter
        replayClosest -- max number of cdx lines for a closest-sorted replay
                         query (see the BUG note above this method)
        """
        # NOTE(review): the mapping keys and the final lookup were missing from
        # the copy of this file under review. They are reconstructed here as
        # ArchivalUrl query-type constants selected by wburl.type -- confirm
        # the constant names against wbarchivalurl before merging.
        return {
            ArchivalUrl.QUERY:
                {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

            ArchivalUrl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },

            ArchivalUrl.REPLAY:
                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},

            # BUG: resolveRevisits currently doesn't work for this type of query
            # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
            # but may be an issue in proxy mode
            ArchivalUrl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True},
        }[wburl.type]
class CDXCaptureResult(OrderedDict):
    """
    One parsed cdx line, exposed as an ordered mapping of field name to
    field value (all values are kept as strings).

    The field names are chosen by matching the number of space-separated
    fields in the line against the known layouts in CDX_FORMATS.
    str() of the object gives back the cdx line; after any item has been
    assigned, the line is rebuilt from the current values.
    """

    # NOTE(review): only the five layout labels survived in the copy of this
    # file under review; the field lists themselves were missing. They are
    # reconstructed here from the standard CDX 9 / CDX 11 column layouts --
    # confirm names and order against the upstream definition.
    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "length"],

        # CDX 11 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "robotflags", "length", "offset", "filename"],

        # CDX 9 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "offset", "filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "robotflags", "length", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"],
    ]

    def __init__(self, cdxline):
        """
        Parse a single cdx line.

        cdxline -- one line of cdx output; trailing whitespace is ignored

        Raises wbexceptions.InvalidCDXException if the number of fields does
        not match any layout in CDX_FORMATS.
        """
        # The base class must be initialised before any item is stored.
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        cdxformat = None
        for names in CDXCaptureResult.CDX_FORMATS:
            if len(names) == len(fields):
                cdxformat = names
                break

        if cdxformat is None:
            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # Must come last: every assignment above reset cdxline to None.
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        """Store the field and drop the cached line so str() regenerates it."""
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __str__(self):
        """Return the cdx line, rebuilt from the values if a field changed."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())
# Testing
import utils

if __name__ == "__main__" or utils.enable_doctests():
    # pprint and cdxserver are module-level names used by the doctest in
    # RemoteCDXServer's docstring; that doctest queries the live server below.
    from pprint import pprint

    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    import doctest
    # NOTE(review): no runner call was visible after the import in the copy of
    # this file under review; without it the guard above has no effect.
    doctest.testmod()