2013-12-18 18:52:52 -08:00
|
|
|
import urllib
|
|
|
|
import urllib2
|
2013-12-19 12:06:47 -08:00
|
|
|
import wbexceptions
|
2013-12-28 05:00:06 -08:00
|
|
|
import itertools
|
2013-12-18 18:52:52 -08:00
|
|
|
|
2013-12-20 14:54:41 -08:00
|
|
|
from wbarchivalurl import ArchivalUrl
|
|
|
|
|
2013-12-18 18:52:52 -08:00
|
|
|
class RemoteCDXServer:
    """Queries a remote cdx-server over HTTP and optionally parses the results.

    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(x[0])
    {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
     'length': '1792',
     'mimetype': 'text/html',
     'offset': '49482198',
     'original': 'http://example.com:80/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '200',
     'timestamp': '20020120142510',
     'urlkey': 'com,example)/'}

    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
    >>> pprint(x[0])
    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
     'length': '523',
     'mimetype': 'warc/revisit',
     'offset': '247256770',
     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
     'orig.length': '529',
     'orig.offset': '769759',
     'original': 'http://www.example.com/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '-',
     'timestamp': '20131210052355',
     'urlkey': 'com,example)/'}
    """

    def __init__(self, serverUrl):
        # Base url of the remote cdx-server endpoint, e.g.
        # 'http://web.archive.org/cdx/search/cdx'
        self.serverUrl = serverUrl

    def load(self, url, params = None, parse_cdx = False, **kwvalues):
        """Query the cdx server for captures of url.

        url       -- the url to look up (required, passed explicitly)
        params    -- optional dict of extra cdx-server query params
        parse_cdx -- if True, return a list of CDXCaptureResult dicts;
                     otherwise return the raw response object
        kwvalues  -- additional query params, merged over params

        Raises wbexceptions.AccessException on a 403 (robots/exclusion)
        response; re-raises any other urllib2.HTTPError.
        """
        # Copy instead of mutating in place: the original signature used
        # 'params = {}', the shared-mutable-default pitfall, and also
        # clobbered the caller's dict by inserting 'url' into it.
        params = dict(params) if params else {}

        # url is required, must be passed explicitly!
        params['url'] = url
        params.update(**kwvalues)

        urlparams = urllib.urlencode(params)

        try:
            request = urllib2.Request(self.serverUrl, urlparams)
            response = urllib2.urlopen(request)
        # 'as' form is valid on Python 2.6+ (the old 'except E, e' is not
        # forward-compatible)
        except urllib2.HTTPError as e:
            if e.code == 403:
                # Distinguish robots.txt blocking from other exclusions
                # by inspecting the error body returned by the server.
                exc_msg = e.read()
                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
                raise wbexceptions.AccessException(msg)
            else:
                raise e

        if parse_cdx:
            # One CDXCaptureResult per line of the response
            return map(CDXCaptureResult, response)
        else:
            return response

    @staticmethod
    def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
        """Return the cdx-server query params suited to wburl's request type.

        Keyed by the ArchivalUrl type constants; raises KeyError for an
        unknown type.
        """
        return {
            # Capture query for a single url: collapse near-duplicates,
            # drop server-error captures.
            ArchivalUrl.QUERY:
                {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

            # Prefix (url-range) query with per-url group/uniq counts.
            ArchivalUrl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },

            # Replay: captures nearest to the requested timestamp,
            # resolving revisit records to their original payloads.
            ArchivalUrl.REPLAY:
                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},

            # Latest replay: single most recent 2xx/3xx capture.
            ArchivalUrl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
        }[wburl.type]
|
|
|
|
|
|
|
|
|
2013-12-28 05:00:06 -08:00
|
|
|
class CDXCaptureResult(dict):
    """A single cdx line parsed into a dict of field name -> string value.

    The format is detected purely by field count, so every entry in
    CDX_FORMATS must have a distinct length (11, 9, 14 and 12 here).
    """

    CDX_FORMATS = [
        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
    ]

    def __init__(self, cdxline):
        """Parse one space-separated cdx line.

        Raises wbexceptions.InvalidCDXException when the field count
        matches no known format.
        """
        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        # Select the format whose field count matches; break at the
        # first hit (the original loop kept scanning needlessly).
        cdxformat = None
        for i in CDXCaptureResult.CDX_FORMATS:
            if len(i) == len(fields):
                cdxformat = i
                break

        if not cdxformat:
            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        # Builtin zip behaves identically to itertools.izip for these
        # short lists and works on both Python 2 and 3.
        for header, field in zip(cdxformat, fields):
            self[header] = field
|
2013-12-18 18:52:52 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Testing
|
|
|
|
|
|
|
|
|
|
|
|
# Self-test entry point: runs the doctests embedded in this module's
# docstrings against the live wayback cdx server (requires network access).
if __name__ == "__main__":
    import doctest
    from pprint import pprint

    # Module-level name referenced by the doctest examples in
    # RemoteCDXServer's docstring.
    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    doctest.testmod()
|