mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
xmlquery: use compressed length when available (#633)
The field is unfortunately misnamed 'compressedendoffset' in the XML, but OWB actually uses it for the compressed length ('S') CDX field. Without this field, when WARC files are accessed over HTTP, pywb makes open-ended byte range requests, which results in far more data being read from disk than necessary.
This commit is contained in:
parent
73d6735bed
commit
c5c4a54e7d
@ -314,6 +314,11 @@ class XmlQueryIndexSource(BaseIndexSource):
|
||||
cdx['digest'] = self.gettext(item, 'digest')
|
||||
cdx['offset'] = self.gettext(item, 'compressedoffset')
|
||||
cdx['filename'] = self.gettext(item, 'file')
|
||||
|
||||
length = self.gettext(item, 'compressedendoffset')
|
||||
if length:
|
||||
cdx['length'] = length
|
||||
|
||||
return cdx
|
||||
|
||||
def gettext(self, item, name):
|
||||
|
@ -71,13 +71,16 @@ class TestXmlQueryIndexSource(BaseTestClass):
|
||||
@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
|
||||
def test_exact_query(self):
|
||||
res, errs = self.do_query({'url': 'http://example.com/', 'limit': 100})
|
||||
reslist = list(res)
|
||||
|
||||
expected = """\
|
||||
com,example)/ 20180112200243 example.warc.gz
|
||||
com,example)/ 20180216200300 example.warc.gz"""
|
||||
assert(key_ts_res(res) == expected)
|
||||
assert(key_ts_res(reslist) == expected)
|
||||
assert(errs == {})
|
||||
assert query_url == 'http://localhost:8080/path?q=limit%3A+100+type%3Aurlquery+url%3Ahttp%253A%252F%252Fexample.com%252F'
|
||||
assert reslist[0]['length'] == '123'
|
||||
assert 'length' not in reslist[1]
|
||||
|
||||
|
||||
@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
|
||||
@ -119,6 +122,7 @@ URL_RESPONSE_1 = """
|
||||
<results>
|
||||
<result>
|
||||
<compressedoffset>10</compressedoffset>
|
||||
<compressedendoffset>123</compressedendoffset>
|
||||
<mimetype>text/html</mimetype>
|
||||
<file>example.warc.gz</file>
|
||||
<redirecturl>-</redirecturl>
|
||||
|
Loading…
x
Reference in New Issue
Block a user