diff --git a/pywb/warcserver/resource/resolvingloader.py b/pywb/warcserver/resource/resolvingloader.py index aa584f5e..72d8c5ad 100644 --- a/pywb/warcserver/resource/resolvingloader.py +++ b/pywb/warcserver/resource/resolvingloader.py @@ -11,6 +11,8 @@ from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader class ResolvingLoader(object): MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded' + EMPTY_DIGEST = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' + def __init__(self, path_resolvers, record_loader=None, no_record_parse=False): self.path_resolvers = path_resolvers self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader() @@ -163,6 +165,13 @@ class ResolvingLoader(object): Raise exception if no matches found. """ + digest = cdx.get('digest', '-') + + # if the digest is the empty record digest, don't attempt to look up the payload record! + # the payload is simply empty, so use empty payload of existing record + if digest == self.EMPTY_DIGEST: + return headers_record + ref_target_uri = (headers_record.rec_headers. get_header('WARC-Refers-To-Target-URI')) @@ -180,8 +189,6 @@ class ResolvingLoader(object): else: ref_target_date = iso_date_to_timestamp(ref_target_date) - digest = cdx.get('digest', '-') - try: orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri, ref_target_date, diff --git a/tests/test_redirects.py b/tests/test_redirects.py index 6ec904eb..79ce367c 100644 --- a/tests/test_redirects.py +++ b/tests/test_redirects.py @@ -57,7 +57,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): self.writer.write_record(rec) return rec - def create_revisit_record(self, original, url, redirect_url, timestamp): + def create_revisit_record(self, url, timestamp, redirect_url, original_dt): warc_headers = {} warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp) @@ -67,9 +67,9 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): http_headers = StatusAndHeaders('302 Temp Redirect', headers_list, protocol='HTTP/1.0') rec = self.writer.create_revisit_record(url, - digest=original.rec_headers['WARC-Payload-Digest'], + digest='3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', refers_to_uri=url, - refers_to_date=original.rec_headers['WARC-Date'], + refers_to_date=original_dt, warc_headers_dict=warc_headers, http_headers=http_headers) @@ -80,9 +80,12 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): with open(filename, 'wb') as fh: self.writer = WARCWriter(fh, gzip=True) - redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112') - redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112') - response = self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text') + redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '20180626101112') + redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '20180626101112') + response = self.create_response_record('https://www.example.com/', '20180626101112', 'Some Text') + + revisit = self.create_revisit_record('https://example.com/path', '20190626101112', 'https://example.com/abc', response.rec_headers['WARC-Date']) + revisit = self.create_revisit_record('https://www.example.com/', '20190626101112', 'https://www.example.com/', response.rec_headers['WARC-Date']) wb_manager(['init', 'redir']) @@ -91,7 +94,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj')) def test_self_redir_1(self, fmod): - res = self.get('/redir/201806026101112{0}/https://example.com/', fmod) + res = self.get('/redir/20180626101112{0}/https://example.com/', fmod, status=200) assert res.status_code == 200 @@ -102,16 +105,16 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): with open(filename, 'wb') as fh: self.writer = WARCWriter(fh, gzip=True) - response = self.create_response_record('https://www.example.com/sub/path/', '201806026101112', 'Sub Path Data') + response = self.create_response_record('https://www.example.com/sub/path/', '20180626101112', 'Sub Path Data') - response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '201806026101112', 'Sub Path Data Q') + response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '20180626101112', 'Sub Path Data Q') wb_manager(['add', 'redir', filename]) def test_redir_slash(self, fmod): - res = self.get('/redir/201806026101112{0}/https://example.com/sub/path', fmod, status=307) + res = self.get('/redir/20180626101112{0}/https://example.com/sub/path', fmod, status=307) - assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/'.format(fmod)) + assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/'.format(fmod)) res = res.follow() assert res.status_code == 200 @@ -119,14 +122,22 @@ class TestRedirects(CollsDirMixin, BaseConfigTest): assert res.text == 'Sub Path Data' def test_redir_slash_with_query(self, fmod): - res = self.get('/redir/201806026101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307) + res = self.get('/redir/20180626101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307) - assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod)) + assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod)) res = res.follow() assert res.status_code == 200 assert res.text == 'Sub Path Data Q' + def test_revisit_redirect_302(self, fmod): + res = self.get('/redir/20170626101112{0}/https://example.com/path', fmod, status=302) + assert res.headers['Location'].endswith('/redir/20170626101112{0}/https://example.com/abc'.format(fmod)) + assert res.text == '' + + def test_revisit_redirect_skip_self_redir(self, fmod): + res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200) + assert res.text == 'Some Text'