2013-12-18 18:52:52 -08:00
|
|
|
import urllib
|
|
|
|
import urllib2
|
2014-02-17 10:23:37 -08:00
|
|
|
|
|
|
|
from wbexceptions import NotFoundException
|
2013-12-18 18:52:52 -08:00
|
|
|
|
2014-02-17 02:34:39 -08:00
|
|
|
from itertools import chain
|
|
|
|
from pprint import pprint
|
2014-02-12 13:16:07 -08:00
|
|
|
|
2014-02-17 02:34:39 -08:00
|
|
|
from pywb.cdx.cdxserver import CDXServer, CDXException
|
|
|
|
from pywb.cdx.cdxobject import CDXObject
|
2014-01-27 21:46:38 -08:00
|
|
|
|
|
|
|
#=================================================================
|
2014-02-17 02:34:39 -08:00
|
|
|
class IndexReader(object):
    """Translate wayback requests into CDX server queries.

    The reader owns a ``CDXServer`` instance (``self.cdx_server``) and
    delegates the actual index lookup to it.
    """

    def __init__(self, config):
        """Create the underlying CDX server.

        :param config: if it is a ``str`` it is passed straight to the
            ``CDXServer`` constructor; any other value is handed to
            ``CDXServer.create_from_config``.
        """
        if isinstance(config, str):
            self.cdx_server = CDXServer(config)
        else:
            self.cdx_server = CDXServer.create_from_config(config)

    def load_for_request(self, wbrequest):
        """Run the CDX query that corresponds to ``wbrequest``.

        :param wbrequest: request object; this method reads its
            ``wb_url``, ``query_filter`` and ``custom_params`` attributes.
        :return: an iterator over the cdx results, guaranteed to yield
            at least one item.
        :raises NotFoundException: if the query produced no results.
        """
        wburl = wbrequest.wb_url

        # init standard params
        params = self.get_query_params(wburl)

        # add any custom filter from the request
        # (this replaces the default 'filter' entry, if there is one)
        if wbrequest.query_filter:
            params['filter'] = wbrequest.query_filter

        # request-supplied params may override any of the defaults...
        if wbrequest.custom_params:
            params.update(wbrequest.custom_params)

        # ...except the url, which is assigned last and therefore always wins
        params['url'] = wburl.url

        cdxlines = self.load_cdx(output='raw', **params)

        # peek at the first result so an empty result set is detected here
        # instead of later, while the caller is iterating
        cdxlines = self.peek_iter(cdxlines)

        if cdxlines is None:
            raise NotFoundException('No Captures found for: ' + wburl.url)

        return cdxlines

    def load_cdx(self, **params):
        """Forward ``params`` unchanged to ``self.cdx_server.load_cdx``."""
        return self.cdx_server.load_cdx(**params)

    def get_query_params(self, wburl, limit=150000, collapse_time=None,
                         replay_closest=10):
        """Return the default CDX query parameters for ``wburl.type``.

        :param wburl: object exposing ``type``, ``timestamp`` and the type
            constants ``QUERY``, ``URL_QUERY``, ``REPLAY``, ``LATEST_REPLAY``.
        :param limit: value of ``'limit'`` for ``QUERY`` (and ``URL_QUERY``).
        :param collapse_time: value of ``'collapseTime'`` for ``QUERY``.
        :param replay_closest: value of ``'limit'`` for ``REPLAY``.
        :return: a new dict of query parameters.
        :raises NotImplementedError: if ``wburl.type`` is ``URL_QUERY``.
        :raises KeyError: if ``wburl.type`` is none of the known types.
        """
        if wburl.type == wburl.URL_QUERY:
            raise NotImplementedError('Url Query Not Yet Supported')

        return {
            wburl.QUERY: {
                'collapseTime': collapse_time,
                'filter': '!statuscode:(500|502|504)',
                'limit': limit,
            },

            # Unreachable while the NotImplementedError guard above is in
            # place; kept so the parameter set is not lost.
            wburl.URL_QUERY: {
                'collapse': 'urlkey',
                'matchType': 'prefix',
                'showGroupCount': True,
                'showUniqCount': True,
                'lastSkipTimestamp': True,
                'limit': limit,
                'fl': ('urlkey,original,timestamp,'
                       'endtimestamp,groupcount,uniqcount'),
            },

            wburl.REPLAY: {
                'sort': 'closest',
                'filter': '!statuscode:(500|502|504)',
                'limit': replay_closest,
                'closest': wburl.timestamp,
                'resolveRevisits': True,
            },

            # BUG: resolveRevisits currently doesn't work for this type of
            # query. This is not an issue in archival mode, as there is a
            # redirect to the actual timestamp query, but may be an issue
            # in proxy mode.
            wburl.LATEST_REPLAY: {
                'sort': 'reverse',
                'filter': 'statuscode:[23]..',
                'limit': '1',
                'resolveRevisits': True,
            },
        }[wburl.type]

    @staticmethod
    def peek_iter(iterable):
        """Return an equivalent iterator, or ``None`` if there are no items.

        The first item is consumed to test for emptiness and then chained
        back in front of the remaining items, so nothing is lost.

        :param iterable: any iterable; it does not need to be an iterator
            already (a list or tuple works as well as a generator).
        """
        # next() only accepts iterators, so normalise first; iter() returns
        # the very same object when it is handed an iterator.
        iterator = iter(iterable)

        try:
            first = next(iterator)
        except StopIteration:
            return None

        return chain([first], iterator)
|
2013-12-18 18:52:52 -08:00
|
|
|
|
2014-02-17 02:34:39 -08:00
|
|
|
#=================================================================
|
|
|
|
class RemoteCDXServer(IndexReader):
    """IndexReader whose CDX server is built on top of a remote cdx source.

    Note that ``IndexReader.__init__`` is not called; ``self.cdx_server``
    is set up directly here.
    """

    def __init__(self, remote_url, cookie=None):
        """Wire a ``RemoteCDXSource`` into a new ``CDXServer``.

        :param remote_url: passed to ``RemoteCDXSource`` as ``remote_url``.
        :param cookie: passed to ``RemoteCDXSource`` as ``cookie``.
        """
        # NOTE(review): RemoteCDXSource is not among the imports visible at
        # the top of this file -- confirm it is actually in scope.
        source = RemoteCDXSource(
            remote_url=remote_url,
            cookie=cookie,
            proxy_all=True,
        )

        self.remote = source
        self.cdx_server = CDXServer(source)

    #def load_cdx(self, **params):
        #return remote.load_cdx(**params)
|