import urllib
import urllib2
import wbexceptions
import itertools
from wbarchivalurl import ArchivalUrl


class RemoteCDXServer:
    """
    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(x[0])
    {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
     'length': '1792',
     'mimetype': 'text/html',
     'offset': '49482198',
     'original': 'http://example.com:80/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '200',
     'timestamp': '20020120142510',
     'urlkey': 'com,example)/'}

    >>> x = cdxserver.load('example.com', parse_cdx = True, params = {'resolveRevisits': True, 'closest': '20131226', 'sort': 'closest', 'limit': '1'})
    >>> pprint(x[0])
    {'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
     'filename': 'top_domains-00800-20131210-035838/top_domains-00800-20131210051705-00024.warc.gz',
     'length': '523',
     'mimetype': 'warc/revisit',
     'offset': '247256770',
     'orig.filename': 'deccanchronicle.com-20130107-023325/IA-FOC-deccanchronicle.com-20130921004125-00000.warc.gz',
     'orig.length': '529',
     'orig.offset': '769759',
     'original': 'http://www.example.com/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '-',
     'timestamp': '20131210052355',
     'urlkey': 'com,example)/'}
    """

    def __init__(self, serverUrl):
        self.serverUrl = serverUrl

    def load(self, url, params=None, parse_cdx=False, **kwvalues):
        # copy caller-supplied params (or start fresh) so that a shared or
        # default dict is never mutated between calls
        params = dict(params) if params else {}

        # url is required, must be passed explicitly!
        params['url'] = url
        params.update(**kwvalues)

        urlparams = urllib.urlencode(params)

        try:
            request = urllib2.Request(self.serverUrl, urlparams)
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            if e.code == 403:
                exc_msg = e.read()
                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
                raise wbexceptions.AccessException(msg)
            else:
                raise e

        if parse_cdx:
            return map(CDXCaptureResult, response)
        else:
            return response

    @staticmethod
    def getQueryParams(wburl, limit='150000', collapseTime='10', replayClosest='10'):
        return {
            ArchivalUrl.QUERY:
                {'collapseTime': collapseTime,
                 'filter': '!statuscode:(500|502|504)',
                 'limit': limit},

            ArchivalUrl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix',
                 'showGroupCount': True, 'showUniqCount': True,
                 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount'},

            ArchivalUrl.REPLAY:
                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)',
                 'limit': replayClosest, 'closest': wburl.timestamp,
                 'resolveRevisits': True},

            ArchivalUrl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..',
                 'limit': '1', 'resolveRevisits': True},
        }[wburl.type]
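
# Illustrative usage sketch (not part of the original module): a rough idea of
# how getQueryParams() and load() fit together. It assumes 'wburl' is an
# ArchivalUrl exposing .type and .timestamp (the only attributes used above);
# the 'wburl.url' attribute shown below is an assumption for this example.
#
#   cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
#   params = RemoteCDXServer.getQueryParams(wburl)
#   captures = cdxserver.load(wburl.url, params, parse_cdx=True)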


class CDXCaptureResult(dict):
    CDX_FORMATS = [
        # CDX 11 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest",
         "redirect", "robotflags", "length", "offset", "filename"],

        # CDX 9 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest",
         "redirect", "offset", "filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest",
         "redirect", "robotflags", "length", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest",
         "redirect", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"],
    ]

    def __init__(self, cdxline):
        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        cdxformat = None
        for i in CDXCaptureResult.CDX_FORMATS:
            if len(i) == len(fields):
                cdxformat = i

        if not cdxformat:
            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in itertools.izip(cdxformat, fields):
            self[header] = field
            # setattr(self, header, field)

    #def __repr__(self):
    #    return str(vars(self))
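
# Illustrative sketch (not part of the original module): parsing one 11-field
# CDX line into a CDXCaptureResult. The sample values come from the first
# doctest in RemoteCDXServer above; any line read from the CDX server response
# could be passed the same way.
#
#   line = ('com,example)/ 20020120142510 http://example.com:80/ text/html 200 '
#           'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA - - 1792 49482198 '
#           'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz')
#   capture = CDXCaptureResult(line)
#   capture['timestamp']   # -> '20020120142510'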


# Testing
if __name__ == "__main__":
    import doctest
    from pprint import pprint

    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    doctest.testmod()