From c1f0f7517a2da95e0d2077ece27164448fa9683d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 2 Feb 2018 12:08:06 -0800 Subject: [PATCH] indexsource: add new XmlQueryIndexSource - support outbackcdx (tinycdxserver) and OpenWayback xmlquery interface (ukwa/ukwa-pywb#2) - convert xml to cdx iter for exact match - support prefix match (eg. for fuzzy matching) via chaining prefix query, and lazy urlquery in iterator --- pywb/warcserver/index/indexsource.py | 111 ++++++++++++++++++++++++++- pywb/warcserver/warcserver.py | 3 + 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 09101e18..8807e1a5 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -7,6 +7,11 @@ from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, from pywb.utils.binsearch import iter_range from pywb.utils.canonicalize import canonicalize +from pywb.utils.wbexception import NotFoundException, BadRequestException + +from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp +from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN + from pywb.utils.format import res_template from pywb.utils.io import no_except_close from pywb.utils.memento import MementoUtils @@ -14,6 +19,25 @@ from pywb.utils.wbexception import NotFoundException from pywb.warcserver.http import DefaultAdapters from pywb.warcserver.index.cdxobject import CDXObject +from pywb.utils.format import ParamFormatter, res_template +from pywb.utils.memento import MementoUtils + +from pywb.warcserver.index.cdxops import cdx_sort_closest + +from six.moves.urllib.parse import quote_plus + +try: + from lxml import etree +except: + import xml.etree.ElementTree as etree + +import redis + +import requests + +import re +import logging + #============================================================================= class BaseIndexSource(object): @@ -190,7 +214,92 @@ class RemoteIndexSource(BaseIndexSource): return cls(config['api_url'], config['replay_url']) -#============================================================================= +# ============================================================================= +class XmlQueryIndexSource(BaseIndexSource): + EXACT_SUFFIX = '?q=type:urlquery+url:{url}' + PREFIX_SUFFIX = '?q=type:prefixquery+url:{url}' + + def __init__(self, query_api_url): + self.query_api_url = query_api_url + self.session = requests.session() + + def load_index(self, params): + closest = params.get('closest') + + url = params.get('url', '') + + matchType = params.get('matchType', 'exact') + + if matchType == 'exact': + query_url = self.query_api_url + self.EXACT_SUFFIX + elif matchType == 'prefix': + query_url = self.query_api_url + self.PREFIX_SUFFIX + else: + raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType)) + + query_url = query_url.format(url=quote_plus(url)) + + try: + response = self.session.get(query_url) + response.raise_for_status() + + results = etree.fromstring(response.text) + + items = results.find('results').findall('result') + + if matchType == 'exact': + cdx_iter = [self.convert_to_cdx(item) for item in items] + if closest: + cdx_iter = cdx_sort_closest(closest, cdx_iter, limit=10000) + + else: + cdx_iter = self.prefix_query_iter(items) + + except Exception: + if self.logger.getEffectiveLevel() == logging.DEBUG: + import traceback + traceback.print_exc() + + raise NotFoundException('url {0} not found'.format(url)) + + return cdx_iter + + def prefix_query_iter(self, items): + for item in items: + url = self.gettext(item, 'originalurl') + if not url: + continue + + cdx_iter = self.load_index({'url': url}) + for cdx in cdx_iter: + yield cdx + + def convert_to_cdx(self, item): + cdx = CDXObject() + cdx['urlkey'] = self.gettext(item, 'urlkey') + cdx['timestamp'] = self.gettext(item, 'capturedate')[:14] + cdx['url'] = self.gettext(item, 'url') + cdx['mime'] = self.gettext(item, 'mimetype') + cdx['status'] = self.gettext(item, 'httpresponsecode') + cdx['digest'] = self.gettext(item, 'digest') + cdx['offset'] = self.gettext(item, 'compressedoffset') + cdx['filename'] = self.gettext(item, 'file') + return cdx + + def gettext(self, item, name): + elem = item.find(name) + if elem is not None: + return elem.text + else: + return '' + + @classmethod + def init_from_string(cls, value): + if value.startswith('xmlquery+'): + return cls(value[9:]) + + +# ============================================================================= class LiveIndexSource(BaseIndexSource): def __init__(self): self._init_sesh(DefaultAdapters.live_adapter) diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 9f9233d1..c3ed1ef8 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -10,6 +10,8 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource +from pywb.warcserver.index.indexsource import XmlQueryIndexSource + from pywb.warcserver.index.zipnum import ZipNumIndexSource from pywb import DEFAULT_CONFIG @@ -20,6 +22,7 @@ import os SOURCE_LIST = [LiveIndexSource, + XmlQueryIndexSource, WBMementoIndexSource, RedisMultiKeyIndexSource, MementoIndexSource,