mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexsource: add new XmlQueryIndexSource
- support outbackcdx (tinycdxserver) and OpenWayback xmlquery interface (ukwa/ukwa-pywb#2) - convert xml to cdx iter for exact match - support prefix match (eg. for fuzzy matching) via chaining prefix query, and lazy urlquery in iterator
This commit is contained in:
parent
56e7c78ea3
commit
c1f0f7517a
@ -7,6 +7,11 @@ from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp,
|
|||||||
|
|
||||||
from pywb.utils.binsearch import iter_range
|
from pywb.utils.binsearch import iter_range
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
from pywb.utils.wbexception import NotFoundException, BadRequestException
|
||||||
|
|
||||||
|
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||||
|
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
||||||
|
|
||||||
from pywb.utils.format import res_template
|
from pywb.utils.format import res_template
|
||||||
from pywb.utils.io import no_except_close
|
from pywb.utils.io import no_except_close
|
||||||
from pywb.utils.memento import MementoUtils
|
from pywb.utils.memento import MementoUtils
|
||||||
@ -14,6 +19,25 @@ from pywb.utils.wbexception import NotFoundException
|
|||||||
from pywb.warcserver.http import DefaultAdapters
|
from pywb.warcserver.http import DefaultAdapters
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
|
from pywb.utils.format import ParamFormatter, res_template
|
||||||
|
from pywb.utils.memento import MementoUtils
|
||||||
|
|
||||||
|
from pywb.warcserver.index.cdxops import cdx_sort_closest
|
||||||
|
|
||||||
|
from six.moves.urllib.parse import quote_plus
|
||||||
|
|
||||||
|
try:
|
||||||
|
from lxml import etree
|
||||||
|
except:
|
||||||
|
import xml.etree.ElementTree as etree
|
||||||
|
|
||||||
|
import redis
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class BaseIndexSource(object):
|
class BaseIndexSource(object):
|
||||||
@ -190,7 +214,92 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
return cls(config['api_url'], config['replay_url'])
|
return cls(config['api_url'], config['replay_url'])
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
# =============================================================================
|
||||||
|
class XmlQueryIndexSource(BaseIndexSource):
|
||||||
|
EXACT_SUFFIX = '?q=type:urlquery+url:{url}'
|
||||||
|
PREFIX_SUFFIX = '?q=type:prefixquery+url:{url}'
|
||||||
|
|
||||||
|
def __init__(self, query_api_url):
|
||||||
|
self.query_api_url = query_api_url
|
||||||
|
self.session = requests.session()
|
||||||
|
|
||||||
|
def load_index(self, params):
|
||||||
|
closest = params.get('closest')
|
||||||
|
|
||||||
|
url = params.get('url', '')
|
||||||
|
|
||||||
|
matchType = params.get('matchType', 'exact')
|
||||||
|
|
||||||
|
if matchType == 'exact':
|
||||||
|
query_url = self.query_api_url + self.EXACT_SUFFIX
|
||||||
|
elif matchType == 'prefix':
|
||||||
|
query_url = self.query_api_url + self.PREFIX_SUFFIX
|
||||||
|
else:
|
||||||
|
raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType))
|
||||||
|
|
||||||
|
query_url = query_url.format(url=quote_plus(url))
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = self.session.get(query_url)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
results = etree.fromstring(response.text)
|
||||||
|
|
||||||
|
items = results.find('results').findall('result')
|
||||||
|
|
||||||
|
if matchType == 'exact':
|
||||||
|
cdx_iter = [self.convert_to_cdx(item) for item in items]
|
||||||
|
if closest:
|
||||||
|
cdx_iter = cdx_sort_closest(closest, cdx_iter, limit=10000)
|
||||||
|
|
||||||
|
else:
|
||||||
|
cdx_iter = self.prefix_query_iter(items)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
raise NotFoundException('url {0} not found'.format(url))
|
||||||
|
|
||||||
|
return cdx_iter
|
||||||
|
|
||||||
|
def prefix_query_iter(self, items):
|
||||||
|
for item in items:
|
||||||
|
url = self.gettext(item, 'originalurl')
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
cdx_iter = self.load_index({'url': url})
|
||||||
|
for cdx in cdx_iter:
|
||||||
|
yield cdx
|
||||||
|
|
||||||
|
def convert_to_cdx(self, item):
|
||||||
|
cdx = CDXObject()
|
||||||
|
cdx['urlkey'] = self.gettext(item, 'urlkey')
|
||||||
|
cdx['timestamp'] = self.gettext(item, 'capturedate')[:14]
|
||||||
|
cdx['url'] = self.gettext(item, 'url')
|
||||||
|
cdx['mime'] = self.gettext(item, 'mimetype')
|
||||||
|
cdx['status'] = self.gettext(item, 'httpresponsecode')
|
||||||
|
cdx['digest'] = self.gettext(item, 'digest')
|
||||||
|
cdx['offset'] = self.gettext(item, 'compressedoffset')
|
||||||
|
cdx['filename'] = self.gettext(item, 'file')
|
||||||
|
return cdx
|
||||||
|
|
||||||
|
def gettext(self, item, name):
|
||||||
|
elem = item.find(name)
|
||||||
|
if elem is not None:
|
||||||
|
return elem.text
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def init_from_string(cls, value):
|
||||||
|
if value.startswith('xmlquery+'):
|
||||||
|
return cls(value[9:])
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
class LiveIndexSource(BaseIndexSource):
|
class LiveIndexSource(BaseIndexSource):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._init_sesh(DefaultAdapters.live_adapter)
|
self._init_sesh(DefaultAdapters.live_adapter)
|
||||||
|
@ -10,6 +10,8 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq
|
|||||||
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
|
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
|
||||||
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
|
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
|
||||||
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
|
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
|
||||||
|
from pywb.warcserver.index.indexsource import XmlQueryIndexSource
|
||||||
|
|
||||||
from pywb.warcserver.index.zipnum import ZipNumIndexSource
|
from pywb.warcserver.index.zipnum import ZipNumIndexSource
|
||||||
|
|
||||||
from pywb import DEFAULT_CONFIG
|
from pywb import DEFAULT_CONFIG
|
||||||
@ -20,6 +22,7 @@ import os
|
|||||||
|
|
||||||
|
|
||||||
SOURCE_LIST = [LiveIndexSource,
|
SOURCE_LIST = [LiveIndexSource,
|
||||||
|
XmlQueryIndexSource,
|
||||||
WBMementoIndexSource,
|
WBMementoIndexSource,
|
||||||
RedisMultiKeyIndexSource,
|
RedisMultiKeyIndexSource,
|
||||||
MementoIndexSource,
|
MementoIndexSource,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user