mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
added lxml requirements entry to extra_requirements.txt and documented pywb.warcserver.index.indexsource.XmlQueryIndexSource
This commit is contained in:
parent
41c37129c0
commit
9a40d29ac3
@ -4,3 +4,4 @@ boto3
|
|||||||
uwsgi
|
uwsgi
|
||||||
git+https://github.com/esnme/ultrajson.git
|
git+https://github.com/esnme/ultrajson.git
|
||||||
pysocks
|
pysocks
|
||||||
|
lxml
|
||||||
|
@ -18,14 +18,8 @@ from pywb.utils.memento import MementoUtils
|
|||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
from pywb.warcserver.http import DefaultAdapters
|
from pywb.warcserver.http import DefaultAdapters
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from pywb.utils.format import ParamFormatter, res_template
|
|
||||||
from pywb.utils.memento import MementoUtils
|
|
||||||
|
|
||||||
from pywb.warcserver.index.cdxops import cdx_sort_closest
|
from pywb.warcserver.index.cdxops import cdx_sort_closest
|
||||||
|
|
||||||
from six.moves.urllib.parse import quote_plus
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
except:
|
except:
|
||||||
@ -222,14 +216,28 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
class XmlQueryIndexSource(BaseIndexSource):
|
class XmlQueryIndexSource(BaseIndexSource):
|
||||||
EXACT_QUERY = 'type:urlquery url:'
|
"""An index source class for XML files"""
|
||||||
PREFIX_QUERY = 'type:prefixquery url:'
|
|
||||||
|
EXACT_QUERY = 'type:urlquery url:' # type: str
|
||||||
|
PREFIX_QUERY = 'type:prefixquery url:' # type: str
|
||||||
|
|
||||||
def __init__(self, query_api_url):
|
def __init__(self, query_api_url):
|
||||||
self.query_api_url = query_api_url
|
"""Initialize the XmlQueryIndexSource instance
|
||||||
self.session = requests.session()
|
|
||||||
|
:param str query_api_url: The query api URL
|
||||||
|
"""
|
||||||
|
self.query_api_url = query_api_url # type: str
|
||||||
|
self.session = requests.session() # type: requests.Session
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
|
"""Loads the xml query index based on the supplied params
|
||||||
|
|
||||||
|
:param dict[str, str] params: The query params
|
||||||
|
:return: A list or generator of cdx objects
|
||||||
|
:raises NotFoundException: If the query url is not found
|
||||||
|
or the results of the query returns no cdx entries
|
||||||
|
:raises BadRequestException: If the match type is not exact or prefix
|
||||||
|
"""
|
||||||
closest = params.get('closest')
|
closest = params.get('closest')
|
||||||
|
|
||||||
url = params.get('url', '')
|
url = params.get('url', '')
|
||||||
@ -244,8 +252,8 @@ class XmlQueryIndexSource(BaseIndexSource):
|
|||||||
raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType))
|
raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
#OpenSearch API requires double-escaping
|
# OpenSearch API requires double-escaping
|
||||||
#TODO: add option to not double escape if needed
|
# TODO: add option to not double escape if needed
|
||||||
query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
|
query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
|
||||||
self.logger.debug("Running query: %s" % query_url)
|
self.logger.debug("Running query: %s" % query_url)
|
||||||
response = self.session.get(query_url)
|
response = self.session.get(query_url)
|
||||||
@ -278,6 +286,11 @@ class XmlQueryIndexSource(BaseIndexSource):
|
|||||||
return cdx_iter
|
return cdx_iter
|
||||||
|
|
||||||
def prefix_query_iter(self, items):
|
def prefix_query_iter(self, items):
|
||||||
|
"""Returns an iterator yielding the results of performing a prefix query
|
||||||
|
|
||||||
|
:param items: The xml entry elements representing a query
|
||||||
|
:return: An iterator yielding the results of the query
|
||||||
|
"""
|
||||||
for item in items:
|
for item in items:
|
||||||
url = self.gettext(item, 'originalurl')
|
url = self.gettext(item, 'originalurl')
|
||||||
if not url:
|
if not url:
|
||||||
@ -288,6 +301,12 @@ class XmlQueryIndexSource(BaseIndexSource):
|
|||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def convert_to_cdx(self, item):
|
def convert_to_cdx(self, item):
|
||||||
|
"""Converts the etree element to an CDX object
|
||||||
|
|
||||||
|
:param item: The etree element to be converted
|
||||||
|
:return: The CDXObject representing the supplied etree element object
|
||||||
|
:rtype: CDXObject
|
||||||
|
"""
|
||||||
cdx = CDXObject()
|
cdx = CDXObject()
|
||||||
cdx['urlkey'] = self.gettext(item, 'urlkey')
|
cdx['urlkey'] = self.gettext(item, 'urlkey')
|
||||||
cdx['timestamp'] = self.gettext(item, 'capturedate')[:14]
|
cdx['timestamp'] = self.gettext(item, 'capturedate')[:14]
|
||||||
@ -300,6 +319,13 @@ class XmlQueryIndexSource(BaseIndexSource):
|
|||||||
return cdx
|
return cdx
|
||||||
|
|
||||||
def gettext(self, item, name):
|
def gettext(self, item, name):
|
||||||
|
"""Returns the value of the supplied name
|
||||||
|
|
||||||
|
:param item: The etree element to be converted
|
||||||
|
:param name: The name of the field to get its value for
|
||||||
|
:return: The value of the field
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
elem = item.find(name)
|
elem = item.find(name)
|
||||||
if elem is not None:
|
if elem is not None:
|
||||||
return elem.text
|
return elem.text
|
||||||
@ -308,12 +334,25 @@ class XmlQueryIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def init_from_string(cls, value):
|
def init_from_string(cls, value):
|
||||||
|
"""Creates and initializes a new instance of XmlQueryIndexSource
|
||||||
|
IFF the supplied value starts with xmlquery+
|
||||||
|
|
||||||
|
:param str value: The string by which to initialize the XmlQueryIndexSource
|
||||||
|
:return: The initialized XmlQueryIndexSource or None
|
||||||
|
:rtype: XmlQueryIndexSource|None
|
||||||
|
"""
|
||||||
if value.startswith('xmlquery+'):
|
if value.startswith('xmlquery+'):
|
||||||
return cls(value[9:])
|
return cls(value[9:])
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def init_from_config(cls, config):
|
def init_from_config(cls, config):
|
||||||
|
"""Creates and initializes a new instance of XmlQueryIndexSource
|
||||||
|
IFF the supplied dictionary contains the type key equal to xmlquery
|
||||||
|
|
||||||
|
:param dict[str, str] config:
|
||||||
|
:return: The initialized XmlQueryIndexSource or None
|
||||||
|
:rtype: XmlQueryIndexSource|None
|
||||||
|
"""
|
||||||
if config['type'] != 'xmlquery':
|
if config['type'] != 'xmlquery':
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -565,6 +604,7 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
timeout=params.get('_timeout'))
|
timeout=params.get('_timeout'))
|
||||||
|
|
||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
|
assert(res.text)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
no_except_close(res)
|
no_except_close(res)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user