From f1901901280445715162d31156b79078eb580753 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 18 Aug 2022 23:25:38 -0700 Subject: [PATCH] Revisit headers load fix (#751) * revisit loading fix for revisit records with http headers: - if revisit record has http headers, always use those headers - otherwise, continue to use http headers from payload record - parse headers of http and payload records on initial lookup, to simplify loading - tests: add test for loading revisit records with different urls, different headers but same payload - fix for sul-dlss/was-pywb#64 * also bump version to 2.6.8 --- pywb/version.py | 2 +- pywb/warcserver/resource/responseloader.py | 32 ++--- tests/test_integration.py | 6 +- tests/test_redirect_revisits.py | 146 +++++++++++++++++++++ 4 files changed, 168 insertions(+), 18 deletions(-) create mode 100644 tests/test_redirect_revisits.py diff --git a/pywb/version.py b/pywb/version.py index 8dd43443..02a92943 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.6.7' +__version__ = '2.6.8' if __name__ == '__main__': print(__version__) diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 93b0e5ba..b254f441 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -172,7 +172,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): self.resolvers = self.make_resolvers(self.paths) self.resolve_loader = ResolvingLoader(self.resolvers, - no_record_parse=True) + no_record_parse=False) self.headers_parser = StatusAndHeadersParser([], verify=False) @@ -206,18 +206,20 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): local_index_query)) http_headers_buff = None + if payload.rec_type in ('response', 'revisit'): status = cdx.get('status') + try: + orig_size = payload.raw_stream.tell() + except: + orig_size = 0 + + http_headers = headers.http_headers or payload.http_headers + # if status is not set and not, 2xx, 4xx, 5xx # go through self-redirect check just in case if not status or not status.startswith(('2', '4', '5')): - http_headers = self.headers_parser.parse(payload.raw_stream) - try: - orig_size = payload.raw_stream.tell() - except: - orig_size = 0 - try: self.raise_on_self_redirect(params, cdx, http_headers.get_statuscode(), @@ -227,15 +229,15 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): no_except_close(payload.raw_stream) raise - http_headers_buff = http_headers.to_bytes() + http_headers_buff = http_headers and http_headers.to_bytes() - # if new http_headers_buff is different length, - # attempt to adjust content-length on the WARC record - if orig_size and len(http_headers_buff) != orig_size: - orig_cl = payload.rec_headers.get_header('Content-Length') - if orig_cl: - new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size) - payload.rec_headers.replace_header('Content-Length', str(new_cl)) + # if new http_headers_buff is different length, + # attempt to adjust content-length on the WARC record + if http_headers and orig_size and len(http_headers_buff) != orig_size: + orig_cl = payload.rec_headers.get_header('Content-Length') + if orig_cl: + new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size) + payload.rec_headers.replace_header('Content-Length', str(new_cl)) warc_headers = payload.rec_headers diff --git a/tests/test_integration.py b/tests/test_integration.py index 1b0b33e4..36d53139 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -102,13 +102,15 @@ class TestWbIntegration(BaseConfigTest): assert not resp.headers.get('Content-Length') def test_replay_content_head_non_zero_content_length_match(self): - resp = self.testapp.get('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) + resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) length = resp.content_length + print('length', length) # Content-Length included if non-zero - resp = self.testapp.head('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) + resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) #assert resp.headers['Content-Length'] == length + print('length', resp.content_length) assert resp.content_length == length def test_replay_content(self, fmod): diff --git a/tests/test_redirect_revisits.py b/tests/test_redirect_revisits.py new file mode 100644 index 00000000..3188aa1d --- /dev/null +++ b/tests/test_redirect_revisits.py @@ -0,0 +1,146 @@ + +from io import BytesIO +import os + +from warcio import WARCWriter, StatusAndHeaders +from pywb.manager.manager import main as wb_manager + +from .base_config_test import BaseConfigTest, CollsDirMixin, fmod + + +# ============================================================================ +class TestRevisits(CollsDirMixin, BaseConfigTest): + @classmethod + def setup_class(cls): + super(TestRevisits, cls).setup_class('config_test.yaml') + + + def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date): + http_headers = StatusAndHeaders( + "301 Permanent Redirect", headers, protocol="HTTP/1.0" + ) + + return self.writer.create_revisit_record( + url, + digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O", + refers_to_uri=refers_to_uri, + refers_to_date=refers_to_date, + warc_headers_dict={"WARC-Date": date}, + http_headers=http_headers, + ) + + + def create_response_record(self, url, date, headers, payload): + http_headers = StatusAndHeaders( + "301 Permanent Redirect", headers, protocol="HTTP/1.0" + ) + + return self.writer.create_warc_record( + url, + record_type="response", + http_headers=http_headers, + payload=BytesIO(payload), + warc_headers_dict={"WARC-Date": date}, + length=len(payload), + ) + + def create(self): + payload = b"some\ntext" + + # record 1 + self.writer.write_record( + self.create_response_record( + "http://example.com/orig-1", + "2020-01-01T00:00:00Z", + [ + ("Content-Type", 'text/plain; charset="UTF-8"'), + ("Location", "https://example.com/redirect-1"), + ("Content-Length", str(len(payload))), + ("Custom", "1"), + ], + payload, + ) + ) + + # record 2 + self.writer.write_record( + self.create_response_record( + "http://example.com/orig-2", + "2020-01-01T00:00:00Z", + [ + ("Content-Type", 'text/plain; charset="UTF-8"'), + ("Location", "https://example.com/redirect-2"), + ("Content-Length", str(len(payload))), + ("Custom", "2"), + ], + payload, + ) + ) + + # record 3 + self.writer.write_record( + self.create_revisit_record( + "http://example.com/orig-2", + "2022-01-01T00:00:00Z", + [ + ("Content-Type", 'text/plain; charset="UTF-8"'), + ("Location", "https://example.com/redirect-3"), + ("Content-Length", str(len(payload))), + ("Custom", "3"), + ], + refers_to_uri="http://example.com/orig-1", + refers_to_date="2020-01-01T00:00:00Z", + ) + ) + + # record 4 + self.writer.write_record( + self.create_revisit_record( + "http://example.com/", + "2022-01-01T00:00:00Z", + [ + ("Content-Type", 'text/plain; charset="UTF-8"'), + ("Location", "https://example.com/redirect-4"), + ("Content-Length", str(len(payload))), + ("Custom", "4"), + ], + refers_to_uri="http://example.com/orig-2", + refers_to_date="2020-01-01T00:00:00Z", + ) + ) + + def test_init(self): + filename = os.path.join(self.root_dir, 'redir.warc.gz') + with open(filename, 'wb') as fh: + self.writer = WARCWriter(fh, gzip=True) + self.create() + + wb_manager(['init', 'revisits']) + + wb_manager(['add', 'revisits', filename]) + + assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj')) + + def test_different_url_revisit_orig_headers(self, fmod): + res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301) + assert res.headers["Custom"] == "4" + assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod)) + assert res.text == 'some\ntext' + + def test_different_url_revisit_and_response(self, fmod): + res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301) + assert res.headers["Custom"] == "2" + assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod)) + assert res.text == 'some\ntext' + + res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301) + assert res.headers["Custom"] == "3" + assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod)) + assert res.text == 'some\ntext' + + def test_orig(self, fmod): + res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301) + assert res.headers["Custom"] == "1" + assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-1".format(fmod)) + assert res.text == 'some\ntext' +