1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-31 03:04:12 +02:00
pywb/pywb/indexreader.py

281 lines
9.4 KiB
Python

import urllib
import urllib2
import wbexceptions
import itertools
import wbrequestresponse
import surt
from collections import OrderedDict
import binsearch
import cdxserve
import logging
import os
#=================================================================
class IndexReader:
    """
    Base class for cdx index readers.

    load_for_request() drives a query for an incoming request; it relies on
    get_query_params() and load_cdx(), which subclasses are expected to supply.
    """

    def load_for_request(self, wbrequest, parsed_cdx = True):
        """
        Run the cdx query described by wbrequest and return the cdx iterator.

        Raises wbexceptions.NotFoundException when peeking at the loaded
        iterator yields None.
        """
        wburl = wbrequest.wb_url

        # start from the standard params for this type of wb url
        query = self.get_query_params(wburl)

        # a filter supplied with the request replaces the standard one
        if wbrequest.query_filter:
            query['filter'] = wbrequest.query_filter

        # remaining request-level params are merged in last
        if wbrequest.custom_params:
            query.update(wbrequest.custom_params)

        loaded = self.load_cdx(wburl.url, query, parsed_cdx)
        captures = utils.peek_iter(loaded)

        if captures is None:
            raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)

        return self.filter_cdx(wbrequest, captures)

    def filter_cdx(self, wbrequest, cdxlines):
        """
        Hook for subclasses that want to wrap the cdx iterator in a filter.

        The base implementation hands cdxlines back untouched.
        """
        return cdxlines

    def load_cdx(self, url, params = {}, parsed_cdx = True):
        """Load cdx lines for url; must be overridden by subclasses."""
        raise NotImplementedError('Override in subclasses')
#=================================================================
class LocalCDXServer(IndexReader):
    """
    Index reader that queries local cdx files through cdxserve.

    >>> x = LocalCDXServer([test_dir]).load_cdx('example.com', parsed_cdx = True, limit = 1)
    >>> pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20140127171200'),
     ('original', 'http://example.com'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
     ('redirect', '-'),
     ('robotflags', '-'),
     ('length', '1046'),
     ('offset', '334'),
     ('filename', 'dupes.warc.gz')]
    """

    def __init__(self, sources, surt_ordered = True):
        """
        Collect the cdx sources to query.

        Each entry of sources that is a directory contributes every file in
        it whose name ends in '.cdx'; any other entry is added as given.
        surt_ordered records whether lookups use the surt key directly.
        """
        self.sources = []
        self.surt_ordered = surt_ordered
        logging.info('CDX Surt-Ordered? ' + str(surt_ordered))

        for src in sources:
            if os.path.isdir(src):
                for name in os.listdir(src):
                    if name.endswith('.cdx'):
                        # os.path.join inserts the separator when src has no
                        # trailing slash (plain concatenation produced a
                        # non-existent path in that case)
                        full = os.path.join(src, name)
                        logging.info('Adding CDX: ' + full)
                        self.sources.append(full)
            else:
                logging.info('Adding CDX: ' + src)
                self.sources.append(src)

    def load_cdx(self, url, params = None, parsed_cdx = True, **kwvalues):
        """
        Query the local sources for url and return the cdxserve result.

        params and kwvalues are merged into the query (kwvalues win). The
        'output' param is always set here: 'raw' when parsed_cdx is true,
        'text' otherwise. The params dict passed in is copied, never modified.

        Raises wbexceptions.BadUrlException if url cannot be converted
        to a surt.
        """
        # canonicalize to surt (canonicalization is part of surt conversion)
        try:
            key = surt.surt(url)
        except Exception:
            raise wbexceptions.BadUrlException('Bad Request Url: ' + url)

        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = utils.unsurt(key)

        match_func = binsearch.iter_exact

        # work on a private copy: updating a shared default dict (or the
        # caller's dict) in place would leak params from one call into the next
        query = dict(params) if params else {}
        query.update(kwvalues)
        query['output'] = 'raw' if parsed_cdx else 'text'

        return cdxserve.cdx_serve(key, query, self.sources, match_func)

    def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 10):
        """
        Return the cdx query params matching the type of wburl.

        Raises NotImplementedError for url queries, which are not yet
        supported by this reader.
        """
        if wburl.type == wburl.URL_QUERY:
            raise NotImplementedError('Url Query Not Yet Supported')

        return {
            wburl.QUERY:
                {'collapse_time': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

            wburl.URL_QUERY:
                {},
                # raise Exception('Not Yet Implemented')
                # {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                #  'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                # },

            wburl.REPLAY:
                {'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest_to': wburl.timestamp, 'resolve_revisits': True},

            wburl.LATEST_REPLAY:
                {'reverse': True, 'filter': 'statuscode:[23]..', 'limit': '1', 'resolve_revisits': True}

        }[wburl.type]

    def __str__(self):
        return 'load cdx indexes from ' + str(self.sources)
#=================================================================
class RemoteCDXServer(IndexReader):
    """
    Index reader that queries a remote cdx server over http.

    >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx('example.com', parsed_cdx = True, limit = '2')
    >>> pprint(x.next().items())
    [('urlkey', 'com,example)/'),
     ('timestamp', '20020120142510'),
     ('original', 'http://example.com:80/'),
     ('mimetype', 'text/html'),
     ('statuscode', '200'),
     ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
     ('length', '1792')]
    """

    def __init__(self, server_url, cookie = None):
        """Remember the cdx server url and an optional Cookie header value."""
        self.server_url = server_url
        self.auth_cookie = cookie

    def load_cdx(self, url, params = None, parsed_cdx = True, **kwvalues):
        """
        Send a cdx query for url to the remote server.

        params and kwvalues are merged into the query together with 'url',
        url-encoded and passed as the request data. The params dict passed
        in is copied, never modified.

        Returns a generator of CDXCaptureResult when parsed_cdx is true,
        otherwise an iterator over the raw response lines.

        Raises wbexceptions.AccessException on a 403 response; any other
        HTTPError is re-raised unchanged.
        """
        # work on a private copy: writing into a shared default dict (or the
        # caller's dict) would leak params from one call into the next
        query = dict(params) if params else {}

        # url is required, must be passed explicitly!
        query['url'] = url
        query.update(kwvalues)

        urlparams = urllib.urlencode(query, True)

        try:
            request = urllib2.Request(self.server_url, urlparams)

            if self.auth_cookie:
                request.add_header('Cookie', self.auth_cookie)

            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            if e.code == 403:
                exc_msg = e.read()
                msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
                raise wbexceptions.AccessException(msg)
            else:
                # bare raise keeps the original traceback
                raise

        if parsed_cdx:
            return (CDXCaptureResult(cdx) for cdx in response)
        else:
            return iter(response)

    # Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
    # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
    # Soon, this will be switched over to support the native pywb cdx server

    # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
    # with lower values if there are too many captures. Ideally, should be around 10-20
    # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make
    def get_query_params(self, wburl, limit = '150000', collapse_time = '10', replay_closest = '4000'):
        """Return the remote cdx server query params matching the type of wburl."""
        return {
            wburl.QUERY:
                {'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

            wburl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },

            wburl.REPLAY:
                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},

            # BUG: resolveRevisits currently doesn't work for this type of query
            # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
            # but may be an issue in proxy mode
            wburl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}

        }[wburl.type]

    def __str__(self):
        return 'server cdx from ' + self.server_url
#=================================================================
class CDXCaptureResult(OrderedDict):
    """
    A single cdx line parsed into an ordered field-name -> value mapping.

    The field names are chosen by matching the number of space-separated
    fields in the line against CDX_FORMATS. The stripped source line is
    cached in self.cdxline and returned by str() until the mapping is
    modified, after which str() rebuilds the line from the current values.
    """

    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","length"],

        # CDX 11 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],

        # CDX 9 Format
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename",
         "orig.length","orig.offset","orig.filename"],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename",
         "orig.length","orig.offset","orig.filename"]
    ]

    def __init__(self, cdxline):
        """
        Parse cdxline, a single space-separated cdx record.

        Raises wbexceptions.InvalidCDXException when the number of fields
        matches none of CDX_FORMATS.
        """
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        # every format has a distinct length, so the first match is the only one
        cdxformat = None
        for candidate in CDXCaptureResult.CDX_FORMATS:
            if len(candidate) == len(fields):
                cdxformat = candidate
                break

        if not cdxformat:
            raise wbexceptions.InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            self[header] = field

        # assigned last: each self[...] = ... above resets the cached line
        self.cdxline = cdxline

    def __setitem__(self, key, value):
        OrderedDict.__setitem__(self, key, value)

        # force regen on next __str__ call
        self.cdxline = None

    def __delitem__(self, key):
        OrderedDict.__delitem__(self, key)

        # a removed field must not survive in the cached line
        self.cdxline = None

    def __str__(self):
        """Return the cached cdx line, or rebuild it from the current values."""
        if self.cdxline:
            return self.cdxline

        return ' '.join(self.values())
# Doctest support
# utils is imported at module level: besides the check below, the classes
# above call into it at request time (peek_iter, unsurt)
import utils

if __name__ == "__main__" or utils.enable_doctests():
    import doctest

    # pprint and test_dir are the names the docstring examples refer to
    from pprint import pprint

    test_dir = utils.test_data_dir() + 'cdx/'

    doctest.testmod()