1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

lxml query parsing fix: (addressing part of ukwa/ukwa-pywb#38)

- ensure lxml-enabled parsing in XmlQueryIndexSource works by passing the raw bytestring instead of unicode text to the parser
- tests: add lxml and non-lxml parsing tests to test_xmlquery_indexsource.py, add lxml to test install
- misc fixes: fix typo in banner.html, update gevent api to support latest gevent
This commit is contained in:
Ilya Kreymer 2019-02-13 22:50:02 -08:00 committed by John Berlin
parent 8bf2f9debb
commit b8124e3931
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
3 changed files with 30 additions and 2 deletions

View File

@ -248,7 +248,7 @@ class XmlQueryIndexSource(BaseIndexSource):
response = self.session.get(query_url) response = self.session.get(query_url)
response.raise_for_status() response.raise_for_status()
results = etree.fromstring(response.text) results = etree.fromstring(response.content)
items = results.find('results') items = results.find('results')
@ -259,7 +259,7 @@ class XmlQueryIndexSource(BaseIndexSource):
raise NotFoundException('url {0} not found'.format(url)) raise NotFoundException('url {0} not found'.format(url))
if not items: if len(items) == 0:
raise NotFoundException('url {0} not found'.format(url)) raise NotFoundException('url {0} not found'.format(url))
items = items.findall('result') items = items.findall('result')

View File

@ -4,6 +4,7 @@ from pywb.warcserver.index.indexsource import XmlQueryIndexSource
from pywb.warcserver.index.aggregator import SimpleAggregator from pywb.warcserver.index.aggregator import SimpleAggregator
from mock import patch from mock import patch
import pytest
# ============================================================================ # ============================================================================
@ -27,6 +28,10 @@ def mock_get(self, url):
def text(self): def text(self):
return self.string return self.string
@property
def content(self):
return self.string.encode('utf-8')
def raise_for_status(self): def raise_for_status(self):
pass pass
@ -40,6 +45,19 @@ class TestXmlQueryIndexSource(BaseTestClass):
def setup_class(cls): def setup_class(cls):
super(TestXmlQueryIndexSource, cls).setup_class() super(TestXmlQueryIndexSource, cls).setup_class()
cls.xmlpatch = patch('pywb.warcserver.index.indexsource.etree', cls._get_etree())
cls.xmlpatch.start()
@classmethod
def _get_etree(cls):
import xml.etree.ElementTree as etree
return etree
@classmethod
def teardown_class(cls):
cls.xmlpatch.stop()
super(TestXmlQueryIndexSource, cls).teardown_class()
def do_query(self, params): def do_query(self, params):
return SimpleAggregator({'source': XmlQueryIndexSource('http://localhost:8080/path')})(params) return SimpleAggregator({'source': XmlQueryIndexSource('http://localhost:8080/path')})(params)
@ -75,6 +93,15 @@ com,example)/some/path 20180216200300 example.warc.gz"""
assert(errs == {}) assert(errs == {})
# ============================================================================
class TestXmlQueryIndexSourceLXML(TestXmlQueryIndexSource):
@classmethod
def _get_etree(cls):
pytest.importorskip('lxml.etree')
import lxml.etree
return lxml.etree
# ============================================================================ # ============================================================================
URL_RESPONSE_1 = """ URL_RESPONSE_1 = """
<wayback> <wayback>

View File

@ -117,6 +117,7 @@ setup(
'werkzeug', 'werkzeug',
'httpbin==0.5.0', 'httpbin==0.5.0',
'ujson' 'ujson',
'lxml',
], ],
cmdclass={'test': PyTest}, cmdclass={'test': PyTest},
test_suite='', test_suite='',