1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

revisit lookup fix (possible fix for ukwa/ukwa-pywb#53) (#530)

- if a revisit record's digest is the digest of an empty payload, don't attempt to look up an original record; simply use the revisit record itself, with its empty payload
This commit is contained in:
Ilya Kreymer 2020-01-11 11:12:31 -08:00 committed by GitHub
parent f0b9d5b8e8
commit fb8aa7cbc1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 15 deletions

View File

@ -11,6 +11,8 @@ from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
EMPTY_DIGEST = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
@ -163,6 +165,13 @@ class ResolvingLoader(object):
Raise exception if no matches found.
"""
digest = cdx.get('digest', '-')
# if the digest is the empty record digest, don't attempt to look up the payload record!
# the payload is simply empty, so use empty payload of existing record
if digest == self.EMPTY_DIGEST:
return headers_record
ref_target_uri = (headers_record.rec_headers.
get_header('WARC-Refers-To-Target-URI'))
@ -180,8 +189,6 @@ class ResolvingLoader(object):
else:
ref_target_date = iso_date_to_timestamp(ref_target_date)
digest = cdx.get('digest', '-')
try:
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
ref_target_date,

View File

@ -57,7 +57,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
self.writer.write_record(rec)
return rec
def create_revisit_record(self, original, url, redirect_url, timestamp):
def create_revisit_record(self, url, timestamp, redirect_url, original_dt):
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)
@ -67,9 +67,9 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
http_headers = StatusAndHeaders('302 Temp Redirect', headers_list, protocol='HTTP/1.0')
rec = self.writer.create_revisit_record(url,
digest=original.rec_headers['WARC-Payload-Digest'],
digest='3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ',
refers_to_uri=url,
refers_to_date=original.rec_headers['WARC-Date'],
refers_to_date=original_dt,
warc_headers_dict=warc_headers,
http_headers=http_headers)
@ -80,9 +80,12 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)
redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112')
redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112')
response = self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text')
redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '20180626101112')
redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '20180626101112')
response = self.create_response_record('https://www.example.com/', '20180626101112', 'Some Text')
revisit = self.create_revisit_record('https://example.com/path', '20190626101112', 'https://example.com/abc', response.rec_headers['WARC-Date'])
revisit = self.create_revisit_record('https://www.example.com/', '20190626101112', 'https://www.example.com/', response.rec_headers['WARC-Date'])
wb_manager(['init', 'redir'])
@ -91,7 +94,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj'))
def test_self_redir_1(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/', fmod)
res = self.get('/redir/20180626101112{0}/https://example.com/', fmod, status=200)
assert res.status_code == 200
@ -102,16 +105,16 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)
response = self.create_response_record('https://www.example.com/sub/path/', '201806026101112', 'Sub Path Data')
response = self.create_response_record('https://www.example.com/sub/path/', '20180626101112', 'Sub Path Data')
response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '201806026101112', 'Sub Path Data Q')
response = self.create_response_record('https://www.example.com/sub/path/?foo=bar', '20180626101112', 'Sub Path Data Q')
wb_manager(['add', 'redir', filename])
def test_redir_slash(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/sub/path', fmod, status=307)
res = self.get('/redir/20180626101112{0}/https://example.com/sub/path', fmod, status=307)
assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/'.format(fmod))
assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/'.format(fmod))
res = res.follow()
assert res.status_code == 200
@ -119,14 +122,22 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
assert res.text == 'Sub Path Data'
def test_redir_slash_with_query(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307)
res = self.get('/redir/20180626101112{0}/https://example.com/sub/path?foo=bar', fmod, status=307)
assert res.headers['Location'].endswith('/redir/201806026101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod))
assert res.headers['Location'].endswith('/redir/20180626101112{0}/https://example.com/sub/path/?foo=bar'.format(fmod))
res = res.follow()
assert res.status_code == 200
assert res.text == 'Sub Path Data Q'
def test_revisit_redirect_302(self, fmod):
res = self.get('/redir/20170626101112{0}/https://example.com/path', fmod, status=302)
assert res.headers['Location'].endswith('/redir/20170626101112{0}/https://example.com/abc'.format(fmod))
assert res.text == ''
def test_revisit_redirect_skip_self_redir(self, fmod):
res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200)
assert res.text == 'Some Text'