diff --git a/extra_requirements.txt b/extra_requirements.txt index 2ec3ac8d..aa65d62a 100644 --- a/extra_requirements.txt +++ b/extra_requirements.txt @@ -4,3 +4,4 @@ boto3 uwsgi git+https://github.com/esnme/ultrajson.git pysocks +lxml diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 40b8e750..9aa8ebe3 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -18,14 +18,8 @@ from pywb.utils.memento import MementoUtils from pywb.utils.wbexception import NotFoundException from pywb.warcserver.http import DefaultAdapters from pywb.warcserver.index.cdxobject import CDXObject - -from pywb.utils.format import ParamFormatter, res_template -from pywb.utils.memento import MementoUtils - from pywb.warcserver.index.cdxops import cdx_sort_closest -from six.moves.urllib.parse import quote_plus - try: from lxml import etree except: @@ -222,14 +216,28 @@ class RemoteIndexSource(BaseIndexSource): # ============================================================================= class XmlQueryIndexSource(BaseIndexSource): - EXACT_QUERY = 'type:urlquery url:' - PREFIX_QUERY = 'type:prefixquery url:' + """An index source that queries a remote XML query API""" + + EXACT_QUERY = 'type:urlquery url:' # type: str + PREFIX_QUERY = 'type:prefixquery url:' # type: str def __init__(self, query_api_url): - self.query_api_url = query_api_url - self.session = requests.session() + """Initialize the XmlQueryIndexSource instance + + :param str query_api_url: The query api URL + """ + self.query_api_url = query_api_url # type: str + self.session = requests.session() # type: requests.Session def load_index(self, params): + """Loads the xml query index based on the supplied params + + :param dict[str, str] params: The query params + :return: A list or generator of cdx objects + :raises NotFoundException: If the query url is not found + or the query returns no cdx entries + :raises BadRequestException: If the match type is
not exact or prefix + """ closest = params.get('closest') url = params.get('url', '') @@ -244,8 +252,8 @@ class XmlQueryIndexSource(BaseIndexSource): raise BadRequestException('matchType={0} is not supported'.format(matchType=matchType)) try: - #OpenSearch API requires double-escaping - #TODO: add option to not double escape if needed + # OpenSearch API requires double-escaping + # TODO: add option to not double escape if needed query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url)) self.logger.debug("Running query: %s" % query_url) response = self.session.get(query_url) @@ -278,6 +286,11 @@ class XmlQueryIndexSource(BaseIndexSource): return cdx_iter def prefix_query_iter(self, items): + """Returns an iterator yielding the results of performing a prefix query + + :param items: The xml entry elements representing a query + :return: An iterator yielding the results of the query + """ for item in items: url = self.gettext(item, 'originalurl') if not url: @@ -288,6 +301,12 @@ class XmlQueryIndexSource(BaseIndexSource): yield cdx def convert_to_cdx(self, item): + """Converts the etree element to a CDX object + + :param item: The etree element to be converted + :return: The CDXObject representing the supplied etree element object + :rtype: CDXObject + """ cdx = CDXObject() cdx['urlkey'] = self.gettext(item, 'urlkey') cdx['timestamp'] = self.gettext(item, 'capturedate')[:14] @@ -300,6 +319,13 @@ class XmlQueryIndexSource(BaseIndexSource): return cdx def gettext(self, item, name): + """Returns the text of the child element with the supplied name + + :param item: The etree element to search + :param name: The name of the child element whose text is returned + :return: The text of the child element + :rtype: str + """ elem = item.find(name) if elem is not None: return elem.text @@ -308,12 +334,25 @@ class XmlQueryIndexSource(BaseIndexSource): @classmethod def init_from_string(cls, value): + """Creates and initializes a new instance of XmlQueryIndexSource + IFF the supplied value
starts with xmlquery+ + + :param str value: The string by which to initialize the XmlQueryIndexSource + :return: The initialized XmlQueryIndexSource or None + :rtype: XmlQueryIndexSource|None + """ if value.startswith('xmlquery+'): return cls(value[9:]) - @classmethod def init_from_config(cls, config): + """Creates and initializes a new instance of XmlQueryIndexSource + IFF the supplied dictionary contains the type key equal to xmlquery + + :param dict[str, str] config: The index source configuration; its type key is checked + :return: The initialized XmlQueryIndexSource or None + :rtype: XmlQueryIndexSource|None + """ if config['type'] != 'xmlquery': return @@ -565,6 +604,7 @@ class MementoIndexSource(BaseIndexSource): timeout=params.get('_timeout')) res.raise_for_status() + assert(res.text) except Exception as e: no_except_close(res)