From b8124e3931e16c5bb205a97158aa2926809188ff Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Feb 2019 22:50:02 -0800 Subject: [PATCH] lxml query parsing fix: (addressing part of ukwa/ukwa-pywb#38) - ensure lxml-enabled parsing in XmlQueryIndexSource works by passing the raw bytestring instead of unicode text to the parser - tests: add lxml and non-lxml parsing tests to test_xmlquery_indexsource.py, add lxml to test install - misc fixes: fix typo in banner.html, update gevent api to support latest gevent --- pywb/warcserver/index/indexsource.py | 4 +-- .../index/test/test_xmlquery_indexsource.py | 27 +++++++++++++++++++ setup.py | 3 ++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 4d1d7136..32a5161b 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -248,7 +248,7 @@ class XmlQueryIndexSource(BaseIndexSource): response = self.session.get(query_url) response.raise_for_status() - results = etree.fromstring(response.text) + results = etree.fromstring(response.content) items = results.find('results') @@ -259,7 +259,7 @@ class XmlQueryIndexSource(BaseIndexSource): raise NotFoundException('url {0} not found'.format(url)) - if not items: + if len(items) == 0: raise NotFoundException('url {0} not found'.format(url)) items = items.findall('result') diff --git a/pywb/warcserver/index/test/test_xmlquery_indexsource.py b/pywb/warcserver/index/test/test_xmlquery_indexsource.py index fc5d45b2..22158148 100644 --- a/pywb/warcserver/index/test/test_xmlquery_indexsource.py +++ b/pywb/warcserver/index/test/test_xmlquery_indexsource.py @@ -4,6 +4,7 @@ from pywb.warcserver.index.indexsource import XmlQueryIndexSource from pywb.warcserver.index.aggregator import SimpleAggregator from mock import patch +import pytest # ============================================================================ @@ -27,6 +28,10 @@ def mock_get(self, url): 
def text(self): return self.string + @property + def content(self): + return self.string.encode('utf-8') + def raise_for_status(self): pass @@ -40,6 +45,19 @@ class TestXmlQueryIndexSource(BaseTestClass): def setup_class(cls): super(TestXmlQueryIndexSource, cls).setup_class() + cls.xmlpatch = patch('pywb.warcserver.index.indexsource.etree', cls._get_etree()) + cls.xmlpatch.start() + + @classmethod + def _get_etree(cls): + import xml.etree.ElementTree as etree + return etree + + @classmethod + def teardown_class(cls): + cls.xmlpatch.stop() + super(TestXmlQueryIndexSource, cls).teardown_class() + def do_query(self, params): return SimpleAggregator({'source': XmlQueryIndexSource('http://localhost:8080/path')})(params) @@ -75,6 +93,15 @@ com,example)/some/path 20180216200300 example.warc.gz""" assert(errs == {}) +# ============================================================================ +class TestXmlQueryIndexSourceLXML(TestXmlQueryIndexSource): + @classmethod + def _get_etree(cls): + pytest.importorskip('lxml.etree') + import lxml.etree + return lxml.etree + + # ============================================================================ URL_RESPONSE_1 = """ diff --git a/setup.py b/setup.py index 308d0e33..19c973c3 100755 --- a/setup.py +++ b/setup.py @@ -117,6 +117,7 @@ setup( 'werkzeug', 'httpbin==0.5.0', - 'ujson' + 'ujson', + 'lxml', ], cmdclass={'test': PyTest}, test_suite='',