mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Added lxml requirements entry to extra_requirements.txt and documented pywb.warcserver.index.indexsource.XmlQueryIndexSource
This commit is contained in:
parent
41c37129c0
commit
9a40d29ac3
@ -4,3 +4,4 @@ boto3
|
||||
uwsgi
|
||||
git+https://github.com/esnme/ultrajson.git
|
||||
pysocks
|
||||
lxml
|
||||
|
@ -18,14 +18,8 @@ from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.utils.memento import MementoUtils
|
||||
|
||||
from pywb.warcserver.index.cdxops import cdx_sort_closest
|
||||
|
||||
from six.moves.urllib.parse import quote_plus
|
||||
|
||||
try:
|
||||
from lxml import etree
|
||||
except:
|
||||
@ -222,14 +216,28 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
|
||||
# =============================================================================
|
||||
class XmlQueryIndexSource(BaseIndexSource):
|
||||
EXACT_QUERY = 'type:urlquery url:'
|
||||
PREFIX_QUERY = 'type:prefixquery url:'
|
||||
"""An index source class for XML files"""
|
||||
|
||||
EXACT_QUERY = 'type:urlquery url:' # type: str
|
||||
PREFIX_QUERY = 'type:prefixquery url:' # type: str
|
||||
|
||||
def __init__(self, query_api_url):
|
||||
self.query_api_url = query_api_url
|
||||
self.session = requests.session()
|
||||
"""Initialize the XmlQueryIndexSource instance
|
||||
|
||||
:param str query_api_url: The query api URL
|
||||
"""
|
||||
self.query_api_url = query_api_url # type: str
|
||||
self.session = requests.session() # type: requests.Session
|
||||
|
||||
def load_index(self, params):
|
||||
"""Loads the xml query index based on the supplied params
|
||||
|
||||
:param dict[str, str] params: The query params
|
||||
:return: A list or generator of cdx objects
|
||||
:raises NotFoundException: If the query url is not found
|
||||
or the results of the query returns no cdx entries
|
||||
:raises BadRequestException: If the match type is not exact or prefix
|
||||
"""
|
||||
closest = params.get('closest')
|
||||
|
||||
url = params.get('url', '')
|
||||
@ -278,6 +286,11 @@ class XmlQueryIndexSource(BaseIndexSource):
|
||||
return cdx_iter
|
||||
|
||||
def prefix_query_iter(self, items):
|
||||
"""Returns an iterator yielding the results of performing a prefix query
|
||||
|
||||
:param items: The xml entry elements representing a query
|
||||
:return: An iterator yielding the results of the query
|
||||
"""
|
||||
for item in items:
|
||||
url = self.gettext(item, 'originalurl')
|
||||
if not url:
|
||||
@ -288,6 +301,12 @@ class XmlQueryIndexSource(BaseIndexSource):
|
||||
yield cdx
|
||||
|
||||
def convert_to_cdx(self, item):
|
||||
"""Converts the etree element to an CDX object
|
||||
|
||||
:param item: The etree element to be converted
|
||||
:return: The CDXObject representing the supplied etree element object
|
||||
:rtype: CDXObject
|
||||
"""
|
||||
cdx = CDXObject()
|
||||
cdx['urlkey'] = self.gettext(item, 'urlkey')
|
||||
cdx['timestamp'] = self.gettext(item, 'capturedate')[:14]
|
||||
@ -300,6 +319,13 @@ class XmlQueryIndexSource(BaseIndexSource):
|
||||
return cdx
|
||||
|
||||
def gettext(self, item, name):
|
||||
"""Returns the value of the supplied name
|
||||
|
||||
:param item: The etree element to search for the named child element
|
||||
:param name: The name of the field to get its value for
|
||||
:return: The value of the field
|
||||
:rtype: str
|
||||
"""
|
||||
elem = item.find(name)
|
||||
if elem is not None:
|
||||
return elem.text
|
||||
@ -308,12 +334,25 @@ class XmlQueryIndexSource(BaseIndexSource):
|
||||
|
||||
@classmethod
|
||||
def init_from_string(cls, value):
|
||||
"""Creates and initializes a new instance of XmlQueryIndexSource
|
||||
IFF the supplied value starts with xmlquery+
|
||||
|
||||
:param str value: The string by which to initialize the XmlQueryIndexSource
|
||||
:return: The initialized XmlQueryIndexSource or None
|
||||
:rtype: XmlQueryIndexSource|None
|
||||
"""
|
||||
if value.startswith('xmlquery+'):
|
||||
return cls(value[9:])
|
||||
|
||||
|
||||
@classmethod
|
||||
def init_from_config(cls, config):
|
||||
"""Creates and initializes a new instance of XmlQueryIndexSource
|
||||
IFF the supplied dictionary contains the type key equal to xmlquery
|
||||
|
||||
:param dict[str, str] config:
|
||||
:return: The initialized XmlQueryIndexSource or None
|
||||
:rtype: XmlQueryIndexSource|None
|
||||
"""
|
||||
if config['type'] != 'xmlquery':
|
||||
return
|
||||
|
||||
@ -565,6 +604,7 @@ class MementoIndexSource(BaseIndexSource):
|
||||
timeout=params.get('_timeout'))
|
||||
|
||||
res.raise_for_status()
|
||||
assert(res.text)
|
||||
|
||||
except Exception as e:
|
||||
no_except_close(res)
|
||||
|
Loading…
x
Reference in New Issue
Block a user