mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Revisit headers load fix (#751)
* Fix loading of revisit records that carry their own HTTP headers: - if the revisit record has HTTP headers, always use those headers; - otherwise, continue to use the HTTP headers from the payload record; - parse the HTTP headers of both the headers record and the payload record on initial lookup, to simplify loading; - tests: add a test for loading revisit records with different URLs and different headers but the same payload; - fix for sul-dlss/was-pywb#64. * Also bump version to 2.6.8.
This commit is contained in:
parent
49393ce16a
commit
f190190128
@ -1,4 +1,4 @@
|
||||
__version__ = '2.6.7'
|
||||
__version__ = '2.6.8'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
@ -172,7 +172,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
self.resolvers = self.make_resolvers(self.paths)
|
||||
|
||||
self.resolve_loader = ResolvingLoader(self.resolvers,
|
||||
no_record_parse=True)
|
||||
no_record_parse=False)
|
||||
|
||||
self.headers_parser = StatusAndHeadersParser([], verify=False)
|
||||
|
||||
@ -206,18 +206,20 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
local_index_query))
|
||||
|
||||
http_headers_buff = None
|
||||
|
||||
if payload.rec_type in ('response', 'revisit'):
|
||||
status = cdx.get('status')
|
||||
|
||||
# if status is not set, or is not 2xx, 4xx or 5xx,
|
||||
# go through self-redirect check just in case
|
||||
if not status or not status.startswith(('2', '4', '5')):
|
||||
http_headers = self.headers_parser.parse(payload.raw_stream)
|
||||
try:
|
||||
orig_size = payload.raw_stream.tell()
|
||||
except:
|
||||
orig_size = 0
|
||||
|
||||
http_headers = headers.http_headers or payload.http_headers
|
||||
|
||||
# if status is not set, or is not 2xx, 4xx or 5xx,
|
||||
# go through self-redirect check just in case
|
||||
if not status or not status.startswith(('2', '4', '5')):
|
||||
try:
|
||||
self.raise_on_self_redirect(params, cdx,
|
||||
http_headers.get_statuscode(),
|
||||
@ -227,11 +229,11 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
no_except_close(payload.raw_stream)
|
||||
raise
|
||||
|
||||
http_headers_buff = http_headers.to_bytes()
|
||||
http_headers_buff = http_headers and http_headers.to_bytes()
|
||||
|
||||
# if new http_headers_buff is different length,
|
||||
# attempt to adjust content-length on the WARC record
|
||||
if orig_size and len(http_headers_buff) != orig_size:
|
||||
if http_headers and orig_size and len(http_headers_buff) != orig_size:
|
||||
orig_cl = payload.rec_headers.get_header('Content-Length')
|
||||
if orig_cl:
|
||||
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
|
||||
|
@ -102,13 +102,15 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert not resp.headers.get('Content-Length')
|
||||
|
||||
def test_replay_content_head_non_zero_content_length_match(self):
|
||||
resp = self.testapp.get('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||
length = resp.content_length
|
||||
print('length', length)
|
||||
|
||||
# Content-Length included if non-zero
|
||||
resp = self.testapp.head('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||
|
||||
#assert resp.headers['Content-Length'] == length
|
||||
print('length', resp.content_length)
|
||||
assert resp.content_length == length
|
||||
|
||||
def test_replay_content(self, fmod):
|
||||
|
146
tests/test_redirect_revisits.py
Normal file
146
tests/test_redirect_revisits.py
Normal file
@ -0,0 +1,146 @@
|
||||
|
||||
from io import BytesIO
|
||||
import os
|
||||
|
||||
from warcio import WARCWriter, StatusAndHeaders
|
||||
from pywb.manager.manager import main as wb_manager
|
||||
|
||||
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRevisits(CollsDirMixin, BaseConfigTest):
    """Replay tests for revisit records that carry their own HTTP headers.

    ``test_init`` writes a small WARC file holding two response records and
    two revisit records, then adds it to a ``revisits`` collection.  The
    remaining tests replay those captures and check which ``Custom`` and
    ``Location`` headers, and which body, are served for each one.

    The tests depend on definition order: ``test_init`` must run first.
    """

    @classmethod
    def setup_class(cls):
        """Delegate class-level setup to the parent, using ``config_test.yaml``."""
        super(TestRevisits, cls).setup_class('config_test.yaml')

    @staticmethod
    def _redirect_headers(num, payload):
        """Return the HTTP header list used for test record number ``num``.

        :param num: record number as a string; used for the ``Custom`` value
            and as the suffix of the ``Location`` target
        :param payload: payload bytes; only its length is used
        """
        return [
            ("Content-Type", 'text/plain; charset="UTF-8"'),
            ("Location", "https://example.com/redirect-" + num),
            ("Content-Length", str(len(payload))),
            ("Custom", num),
        ]

    def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
        """Build (without writing) a revisit record with a 301 status line.

        :param url: URI the revisit record is created for
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP headers
        :param refers_to_uri: URI of the record this revisit refers to
        :param refers_to_date: date of the record this revisit refers to
        :return: the record built by ``self.writer.create_revisit_record``
        """
        status_headers = StatusAndHeaders(
            "301 Permanent Redirect", headers, protocol="HTTP/1.0"
        )

        # The digest is a fixed value shared by every revisit created here
        # (presumably the digest of the test payload -- not verified here).
        return self.writer.create_revisit_record(
            url,
            digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
            refers_to_uri=refers_to_uri,
            refers_to_date=refers_to_date,
            warc_headers_dict={"WARC-Date": date},
            http_headers=status_headers,
        )

    def create_response_record(self, url, date, headers, payload):
        """Build (without writing) a response record with a 301 status line.

        :param url: URI the response record is created for
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP headers
        :param payload: body bytes of the response
        :return: the record built by ``self.writer.create_warc_record``
        """
        status_headers = StatusAndHeaders(
            "301 Permanent Redirect", headers, protocol="HTTP/1.0"
        )
        body = BytesIO(payload)

        return self.writer.create_warc_record(
            url,
            record_type="response",
            http_headers=status_headers,
            payload=body,
            warc_headers_dict={"WARC-Date": date},
            length=len(payload),
        )

    def create(self):
        """Write the four test records, in order, through ``self.writer``.

        Records 1 and 2 are responses for ``orig-1`` and ``orig-2``.
        Record 3 is a revisit of ``orig-2`` that refers to ``orig-1``.
        Record 4 is a revisit of ``http://example.com/`` that refers to
        ``orig-2``.  Every record has its own ``Location`` and ``Custom``
        header values.
        """
        payload = b"some\ntext"
        orig_date = "2020-01-01T00:00:00Z"
        revisit_date = "2022-01-01T00:00:00Z"

        # records 1 and 2: (url, record number)
        responses = (
            ("http://example.com/orig-1", "1"),
            ("http://example.com/orig-2", "2"),
        )
        for url, num in responses:
            record = self.create_response_record(
                url,
                orig_date,
                self._redirect_headers(num, payload),
                payload,
            )
            self.writer.write_record(record)

        # records 3 and 4: (url, record number, refers-to uri)
        revisits = (
            ("http://example.com/orig-2", "3", "http://example.com/orig-1"),
            ("http://example.com/", "4", "http://example.com/orig-2"),
        )
        for url, num, target_uri in revisits:
            record = self.create_revisit_record(
                url,
                revisit_date,
                self._redirect_headers(num, payload),
                refers_to_uri=target_uri,
                refers_to_date=orig_date,
            )
            self.writer.write_record(record)

    def test_init(self):
        """Write the test WARC, then create and populate the ``revisits`` collection."""
        warc_path = os.path.join(self.root_dir, 'redir.warc.gz')
        with open(warc_path, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)
            self.create()

        wb_manager(['init', 'revisits'])

        wb_manager(['add', 'revisits', warc_path])

        index_path = os.path.join(
            self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'
        )
        assert os.path.isfile(index_path)

    def test_different_url_revisit_orig_headers(self, fmod):
        """The revisit of ``http://example.com/`` is served with its own headers (record 4)."""
        expected_location = "/20220101{0}/https://example.com/redirect-4".format(fmod)

        resp = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)

        assert resp.headers["Custom"] == "4"
        assert resp.headers["Location"].endswith(expected_location)
        assert resp.text == 'some\ntext'

    def test_different_url_revisit_and_response(self, fmod):
        """``orig-2`` serves record 2's headers at 2020 and record 3's headers at 2022."""
        # (request path, expected Custom value, expected Location suffix)
        cases = (
            ('/revisits/20200101{0}/http://example.com/orig-2',
             "2",
             "/20200101{0}/https://example.com/redirect-2"),
            ('/revisits/20220101{0}/http://example.com/orig-2',
             "3",
             "/20220101{0}/https://example.com/redirect-3"),
        )

        for path, custom, location in cases:
            resp = self.get(path, fmod, status=301)
            assert resp.headers["Custom"] == custom
            assert resp.headers["Location"].endswith(location.format(fmod))
            assert resp.text == 'some\ntext'

    def test_orig(self, fmod):
        """The original ``orig-1`` response is served with its own headers (record 1)."""
        expected_location = "/20200101{0}/https://example.com/redirect-1".format(fmod)

        resp = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)

        assert resp.headers["Custom"] == "1"
        assert resp.headers["Location"].endswith(expected_location)
        assert resp.text == 'some\ntext'
|
||||
|
Loading…
x
Reference in New Issue
Block a user