From c5c4a54e7d4caded36dd939a75194cd89a49fdc6 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 27 Apr 2021 12:59:37 +0900 Subject: [PATCH] xmlquery: use compressed length when available (#633) The field is unfortunately misnamed compressedendoffset in XML, but OWB actually uses it for the compressed length 'S' CDX field. Without this field, when WARC files are accessed over HTTP, pywb makes open-ended byte range requests, which results in a lot more data being read from disk than necessary. --- pywb/warcserver/index/indexsource.py | 5 +++++ pywb/warcserver/index/test/test_xmlquery_indexsource.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 84a54800..e5e3f7bb 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -314,6 +314,11 @@ class XmlQueryIndexSource(BaseIndexSource): cdx['digest'] = self.gettext(item, 'digest') cdx['offset'] = self.gettext(item, 'compressedoffset') cdx['filename'] = self.gettext(item, 'file') + + length = self.gettext(item, 'compressedendoffset') + if length: + cdx['length'] = length + return cdx def gettext(self, item, name): diff --git a/pywb/warcserver/index/test/test_xmlquery_indexsource.py b/pywb/warcserver/index/test/test_xmlquery_indexsource.py index 6861aff8..63f832ce 100644 --- a/pywb/warcserver/index/test/test_xmlquery_indexsource.py +++ b/pywb/warcserver/index/test/test_xmlquery_indexsource.py @@ -71,13 +71,16 @@ class TestXmlQueryIndexSource(BaseTestClass): @patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get) def test_exact_query(self): res, errs = self.do_query({'url': 'http://example.com/', 'limit': 100}) + reslist = list(res) expected = """\ com,example)/ 20180112200243 example.warc.gz com,example)/ 20180216200300 example.warc.gz""" - assert(key_ts_res(res) == expected) + assert(key_ts_res(reslist) == expected) assert(errs == {}) assert query_url 
== 'http://localhost:8080/path?q=limit%3A+100+type%3Aurlquery+url%3Ahttp%253A%252F%252Fexample.com%252F' + assert reslist[0]['length'] == '123' + assert 'length' not in reslist[1] @patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get) @@ -119,6 +122,7 @@ URL_RESPONSE_1 = """ 10 + 123 text/html example.warc.gz -