1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Added lxml requirements entry to extra_requirements.txt and documented pywb.warcserver.index.indexsource.XmlQueryIndexSource

This commit is contained in:
John Berlin 2019-04-03 18:35:30 -04:00
parent 41c37129c0
commit 9a40d29ac3
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
2 changed files with 54 additions and 13 deletions

View File

@ -4,3 +4,4 @@ boto3
uwsgi uwsgi
git+https://github.com/esnme/ultrajson.git git+https://github.com/esnme/ultrajson.git
pysocks pysocks
lxml

View File

@ -18,14 +18,8 @@ from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.http import DefaultAdapters from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.index.cdxobject import CDXObject from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.format import ParamFormatter, res_template
from pywb.utils.memento import MementoUtils
from pywb.warcserver.index.cdxops import cdx_sort_closest from pywb.warcserver.index.cdxops import cdx_sort_closest
from six.moves.urllib.parse import quote_plus
try: try:
from lxml import etree from lxml import etree
except: except:
@ -222,14 +216,28 @@ class RemoteIndexSource(BaseIndexSource):
# ============================================================================= # =============================================================================
class XmlQueryIndexSource(BaseIndexSource): class XmlQueryIndexSource(BaseIndexSource):
EXACT_QUERY = 'type:urlquery url:' """An index source class for XML files"""
PREFIX_QUERY = 'type:prefixquery url:'
EXACT_QUERY = 'type:urlquery url:' # type: str
PREFIX_QUERY = 'type:prefixquery url:' # type: str
def __init__(self, query_api_url): def __init__(self, query_api_url):
self.query_api_url = query_api_url """Initialize the XmlQueryIndexSource instance
self.session = requests.session()
:param str query_api_url: The query api URL
"""
self.query_api_url = query_api_url # type: str
self.session = requests.session() # type: requests.Session
def load_index(self, params): def load_index(self, params):
"""Loads the xml query index based on the supplied params
:param dict[str, str] params: The query params
:return: A list or generator of cdx objects
:raises NotFoundException: If the query url is not found
or the query returns no cdx entries
:raises BadRequestException: If the match type is not exact or prefix
"""
closest = params.get('closest') closest = params.get('closest')
url = params.get('url', '') url = params.get('url', '')
@ -244,8 +252,8 @@ class XmlQueryIndexSource(BaseIndexSource):
raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType)) raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType))
try: try:
#OpenSearch API requires double-escaping # OpenSearch API requires double-escaping
#TODO: add option to not double escape if needed # TODO: add option to not double escape if needed
query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url)) query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
self.logger.debug("Running query: %s" % query_url) self.logger.debug("Running query: %s" % query_url)
response = self.session.get(query_url) response = self.session.get(query_url)
@ -278,6 +286,11 @@ class XmlQueryIndexSource(BaseIndexSource):
return cdx_iter return cdx_iter
def prefix_query_iter(self, items): def prefix_query_iter(self, items):
"""Returns an iterator yielding the results of performing a prefix query
:param items: The xml entry elements representing a query
:return: An iterator yielding the results of the query
"""
for item in items: for item in items:
url = self.gettext(item, 'originalurl') url = self.gettext(item, 'originalurl')
if not url: if not url:
@ -288,6 +301,12 @@ class XmlQueryIndexSource(BaseIndexSource):
yield cdx yield cdx
def convert_to_cdx(self, item): def convert_to_cdx(self, item):
Converts the etree element to a CDX object
:param item: The etree element to be converted
:return: The CDXObject representing the supplied etree element object
:rtype: CDXObject
"""
cdx = CDXObject() cdx = CDXObject()
cdx['urlkey'] = self.gettext(item, 'urlkey') cdx['urlkey'] = self.gettext(item, 'urlkey')
cdx['timestamp'] = self.gettext(item, 'capturedate')[:14] cdx['timestamp'] = self.gettext(item, 'capturedate')[:14]
@ -300,6 +319,13 @@ class XmlQueryIndexSource(BaseIndexSource):
return cdx return cdx
def gettext(self, item, name): def gettext(self, item, name):
Returns the text of the child element with the supplied name
:param item: The etree element to search for the named child element
:param name: The name of the field to get its value for
:return: The value of the field
:rtype: str
"""
elem = item.find(name) elem = item.find(name)
if elem is not None: if elem is not None:
return elem.text return elem.text
@ -308,12 +334,25 @@ class XmlQueryIndexSource(BaseIndexSource):
@classmethod @classmethod
def init_from_string(cls, value): def init_from_string(cls, value):
"""Creates and initializes a new instance of XmlQueryIndexSource
IFF the supplied value starts with xmlquery+
:param str value: The string by which to initialize the XmlQueryIndexSource
:return: The initialized XmlQueryIndexSource or None
:rtype: XmlQueryIndexSource|None
"""
if value.startswith('xmlquery+'): if value.startswith('xmlquery+'):
return cls(value[9:]) return cls(value[9:])
@classmethod @classmethod
def init_from_config(cls, config): def init_from_config(cls, config):
"""Creates and initializes a new instance of XmlQueryIndexSource
IFF the supplied dictionary contains the type key equal to xmlquery
:param dict[str, str] config: The index source configuration; its 'type' key must equal 'xmlquery'
:return: The initialized XmlQueryIndexSource or None
:rtype: XmlQueryIndexSource|None
"""
if config['type'] != 'xmlquery': if config['type'] != 'xmlquery':
return return
@ -565,6 +604,7 @@ class MementoIndexSource(BaseIndexSource):
timeout=params.get('_timeout')) timeout=params.get('_timeout'))
res.raise_for_status() res.raise_for_status()
assert(res.text)
except Exception as e: except Exception as e:
no_except_close(res) no_except_close(res)