1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

xmlquery: use compressed length when available (#633)

The field is unfortunately misnamed 'compressedendoffset' in the XML, but OWB
actually uses it for the compressed length ('S' CDX field).

Without this field, when WARC files are accessed over HTTP, pywb will make
open-ended byte-range requests, which results in a lot more data being read
from disk than necessary.
This commit is contained in:
Alex Osborne 2021-04-27 12:59:37 +09:00 committed by GitHub
parent 73d6735bed
commit c5c4a54e7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 10 additions and 1 deletions

View File

@ -314,6 +314,11 @@ class XmlQueryIndexSource(BaseIndexSource):
cdx['digest'] = self.gettext(item, 'digest')
cdx['offset'] = self.gettext(item, 'compressedoffset')
cdx['filename'] = self.gettext(item, 'file')
length = self.gettext(item, 'compressedendoffset')
if length:
cdx['length'] = length
return cdx
def gettext(self, item, name):

View File

@ -71,13 +71,16 @@ class TestXmlQueryIndexSource(BaseTestClass):
@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
def test_exact_query(self):
res, errs = self.do_query({'url': 'http://example.com/', 'limit': 100})
reslist = list(res)
expected = """\
com,example)/ 20180112200243 example.warc.gz
com,example)/ 20180216200300 example.warc.gz"""
assert(key_ts_res(res) == expected)
assert(key_ts_res(reslist) == expected)
assert(errs == {})
assert query_url == 'http://localhost:8080/path?q=limit%3A+100+type%3Aurlquery+url%3Ahttp%253A%252F%252Fexample.com%252F'
assert reslist[0]['length'] == '123'
assert 'length' not in reslist[1]
@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
@ -119,6 +122,7 @@ URL_RESPONSE_1 = """
<results>
<result>
<compressedoffset>10</compressedoffset>
<compressedendoffset>123</compressedendoffset>
<mimetype>text/html</mimetype>
<file>example.warc.gz</file>
<redirecturl>-</redirecturl>