1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Revisit headers load fix (#751)

* revisit loading fix for revisit records with http headers:
- if revisit record has http headers, always use those headers
- otherwise, continue to use http headers from payload record
- parse headers of http and payload records on initial lookup, to simplify loading
- tests: add test for loading revisit records with different urls, different headers but same payload
- fix for sul-dlss/was-pywb#64
* also bump version to 2.6.8
This commit is contained in:
Ilya Kreymer 2022-08-18 23:25:38 -07:00 committed by GitHub
parent 49393ce16a
commit f190190128
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 168 additions and 18 deletions

View File

@ -1,4 +1,4 @@
__version__ = '2.6.7'
__version__ = '2.6.8'
if __name__ == '__main__':
print(__version__)

View File

@ -172,7 +172,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
self.resolvers = self.make_resolvers(self.paths)
self.resolve_loader = ResolvingLoader(self.resolvers,
no_record_parse=True)
no_record_parse=False)
self.headers_parser = StatusAndHeadersParser([], verify=False)
@ -206,18 +206,20 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
local_index_query))
http_headers_buff = None
if payload.rec_type in ('response', 'revisit'):
status = cdx.get('status')
# if status is not set and not, 2xx, 4xx, 5xx
# go through self-redirect check just in case
if not status or not status.startswith(('2', '4', '5')):
http_headers = self.headers_parser.parse(payload.raw_stream)
try:
orig_size = payload.raw_stream.tell()
except:
orig_size = 0
http_headers = headers.http_headers or payload.http_headers
# if status is not set and not, 2xx, 4xx, 5xx
# go through self-redirect check just in case
if not status or not status.startswith(('2', '4', '5')):
try:
self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(),
@ -227,11 +229,11 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
no_except_close(payload.raw_stream)
raise
http_headers_buff = http_headers.to_bytes()
http_headers_buff = http_headers and http_headers.to_bytes()
# if new http_headers_buff is different length,
# attempt to adjust content-length on the WARC record
if orig_size and len(http_headers_buff) != orig_size:
if http_headers and orig_size and len(http_headers_buff) != orig_size:
orig_cl = payload.rec_headers.get_header('Content-Length')
if orig_cl:
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)

View File

@ -102,13 +102,15 @@ class TestWbIntegration(BaseConfigTest):
assert not resp.headers.get('Content-Length')
def test_replay_content_head_non_zero_content_length_match(self):
resp = self.testapp.get('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
length = resp.content_length
print('length', length)
# Content-Length included if non-zero
resp = self.testapp.head('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
#assert resp.headers['Content-Length'] == length
print('length', resp.content_length)
assert resp.content_length == length
def test_replay_content(self, fmod):

View File

@ -0,0 +1,146 @@
from io import BytesIO
import os
from warcio import WARCWriter, StatusAndHeaders
from pywb.manager.manager import main as wb_manager
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
# ============================================================================
class TestRevisits(CollsDirMixin, BaseConfigTest):
@classmethod
def setup_class(cls):
super(TestRevisits, cls).setup_class('config_test.yaml')
def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
http_headers = StatusAndHeaders(
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
)
return self.writer.create_revisit_record(
url,
digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
refers_to_uri=refers_to_uri,
refers_to_date=refers_to_date,
warc_headers_dict={"WARC-Date": date},
http_headers=http_headers,
)
def create_response_record(self, url, date, headers, payload):
http_headers = StatusAndHeaders(
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
)
return self.writer.create_warc_record(
url,
record_type="response",
http_headers=http_headers,
payload=BytesIO(payload),
warc_headers_dict={"WARC-Date": date},
length=len(payload),
)
def create(self):
payload = b"some\ntext"
# record 1
self.writer.write_record(
self.create_response_record(
"http://example.com/orig-1",
"2020-01-01T00:00:00Z",
[
("Content-Type", 'text/plain; charset="UTF-8"'),
("Location", "https://example.com/redirect-1"),
("Content-Length", str(len(payload))),
("Custom", "1"),
],
payload,
)
)
# record 2
self.writer.write_record(
self.create_response_record(
"http://example.com/orig-2",
"2020-01-01T00:00:00Z",
[
("Content-Type", 'text/plain; charset="UTF-8"'),
("Location", "https://example.com/redirect-2"),
("Content-Length", str(len(payload))),
("Custom", "2"),
],
payload,
)
)
# record 3
self.writer.write_record(
self.create_revisit_record(
"http://example.com/orig-2",
"2022-01-01T00:00:00Z",
[
("Content-Type", 'text/plain; charset="UTF-8"'),
("Location", "https://example.com/redirect-3"),
("Content-Length", str(len(payload))),
("Custom", "3"),
],
refers_to_uri="http://example.com/orig-1",
refers_to_date="2020-01-01T00:00:00Z",
)
)
# record 4
self.writer.write_record(
self.create_revisit_record(
"http://example.com/",
"2022-01-01T00:00:00Z",
[
("Content-Type", 'text/plain; charset="UTF-8"'),
("Location", "https://example.com/redirect-4"),
("Content-Length", str(len(payload))),
("Custom", "4"),
],
refers_to_uri="http://example.com/orig-2",
refers_to_date="2020-01-01T00:00:00Z",
)
)
def test_init(self):
filename = os.path.join(self.root_dir, 'redir.warc.gz')
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)
self.create()
wb_manager(['init', 'revisits'])
wb_manager(['add', 'revisits', filename])
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'))
def test_different_url_revisit_orig_headers(self, fmod):
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
assert res.headers["Custom"] == "4"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
assert res.text == 'some\ntext'
def test_different_url_revisit_and_response(self, fmod):
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "2"
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
assert res.text == 'some\ntext'
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "3"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
assert res.text == 'some\ntext'
def test_orig(self, fmod):
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
assert res.headers["Custom"] == "1"
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-1".format(fmod))
assert res.text == 'some\ntext'