1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
pywb/tests/test_redirect_revisits.py
Ilya Kreymer f190190128
Revisit headers load fix (#751)
* revisit loading fix for revisit records with http headers:
- if revisit record has http headers, always use those headers
- otherwise, continue to use http headers from payload record
- parse headers of http and payload records on initial lookup, to simplify loading
- tests: add test for loading revisit records with different urls, different headers but same payload
- fix for sul-dlss/was-pywb#64
* also bump version to 2.6.8
2022-08-18 23:25:38 -07:00

147 lines
5.2 KiB
Python

from io import BytesIO
import os
from warcio import WARCWriter, StatusAndHeaders
from pywb.manager.manager import main as wb_manager
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
# ============================================================================
class TestRevisits(CollsDirMixin, BaseConfigTest):
    """Replay tests for revisit records that carry their own HTTP headers.

    ``test_init`` writes a small WARC containing two response records with
    the same payload and two revisit records that refer back to them, then
    adds that WARC to a ``revisits`` collection.  The remaining tests query
    that collection and check the ``Custom`` and ``Location`` headers as well
    as the body returned for each capture.
    """

    @classmethod
    def setup_class(cls):
        """Run the parent class setup with ``config_test.yaml``."""
        config_file = 'config_test.yaml'
        super(TestRevisits, cls).setup_class(config_file)

    @staticmethod
    def _redirect_status(headers):
        """Wrap ``headers`` in the HTTP/1.0 301 status block used by every record."""
        return StatusAndHeaders(
            "301 Permanent Redirect", headers, protocol="HTTP/1.0"
        )

    def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
        """Build (without writing) a revisit record for ``url``.

        :param url: target URI of the revisit record
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP header tuples
        :param refers_to_uri: URI of the record this revisit refers to
        :param refers_to_date: date of the record this revisit refers to
        :return: the record produced by ``self.writer.create_revisit_record``
        """
        warc_headers = {"WARC-Date": date}
        status_and_headers = self._redirect_status(headers)
        return self.writer.create_revisit_record(
            url,
            digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
            refers_to_uri=refers_to_uri,
            refers_to_date=refers_to_date,
            warc_headers_dict=warc_headers,
            http_headers=status_and_headers,
        )

    def create_response_record(self, url, date, headers, payload):
        """Build (without writing) a response record for ``url``.

        :param url: target URI of the response record
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP header tuples
        :param payload: response body as bytes
        :return: the record produced by ``self.writer.create_warc_record``
        """
        status_and_headers = self._redirect_status(headers)
        body_stream = BytesIO(payload)
        warc_headers = {"WARC-Date": date}
        return self.writer.create_warc_record(
            url,
            record_type="response",
            http_headers=status_and_headers,
            payload=body_stream,
            warc_headers_dict=warc_headers,
            length=len(payload),
        )

    def create(self):
        """Write the four fixture records, in order, through ``self.writer``."""
        payload = b"some\ntext"
        content_length = str(len(payload))

        def http_headers(location, custom):
            # All four records share the same header layout; only the
            # redirect target and the "Custom" marker differ.
            return [
                ("Content-Type", 'text/plain; charset="UTF-8"'),
                ("Location", location),
                ("Content-Length", content_length),
                ("Custom", custom),
            ]

        emit = self.writer.write_record

        # record 1: response for orig-1
        emit(self.create_response_record(
            "http://example.com/orig-1",
            "2020-01-01T00:00:00Z",
            http_headers("https://example.com/redirect-1", "1"),
            payload,
        ))

        # record 2: response for orig-2
        emit(self.create_response_record(
            "http://example.com/orig-2",
            "2020-01-01T00:00:00Z",
            http_headers("https://example.com/redirect-2", "2"),
            payload,
        ))

        # record 3: revisit of orig-2 that refers to the orig-1 response
        emit(self.create_revisit_record(
            "http://example.com/orig-2",
            "2022-01-01T00:00:00Z",
            http_headers("https://example.com/redirect-3", "3"),
            refers_to_uri="http://example.com/orig-1",
            refers_to_date="2020-01-01T00:00:00Z",
        ))

        # record 4: revisit of the root url that refers to the orig-2 response
        emit(self.create_revisit_record(
            "http://example.com/",
            "2022-01-01T00:00:00Z",
            http_headers("https://example.com/redirect-4", "4"),
            refers_to_uri="http://example.com/orig-2",
            refers_to_date="2020-01-01T00:00:00Z",
        ))

    def _assert_redirect_replay(self, fmod, request_path, custom, location_suffix):
        """Fetch ``request_path`` and check the 301 response headers and body.

        ``location_suffix`` is a template with a ``{0}`` slot that is filled
        with ``fmod`` before being compared against the end of ``Location``.
        """
        res = self.get(request_path, fmod, status=301)
        assert res.headers["Custom"] == custom
        assert res.headers["Location"].endswith(location_suffix.format(fmod))
        assert res.text == 'some\ntext'

    def test_init(self):
        """Write the fixture WARC, add it to ``revisits`` and check the index exists."""
        warc_path = os.path.join(self.root_dir, 'redir.warc.gz')
        with open(warc_path, 'wb') as out:
            self.writer = WARCWriter(out, gzip=True)
            self.create()

        # The file is closed at this point, so the manager indexes a complete WARC.
        wb_manager(['init', 'revisits'])
        wb_manager(['add', 'revisits', warc_path])

        index_path = os.path.join(
            self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'
        )
        assert os.path.isfile(index_path)

    def test_different_url_revisit_orig_headers(self, fmod):
        """The 2022 root-url capture is served with the headers of record 4."""
        self._assert_redirect_replay(
            fmod,
            '/revisits/20220101{0}/http://example.com/',
            "4",
            "/20220101{0}/https://example.com/redirect-4",
        )

    def test_different_url_revisit_and_response(self, fmod):
        """orig-2 is served with record 2 headers for 2020 and record 3 headers for 2022."""
        self._assert_redirect_replay(
            fmod,
            '/revisits/20200101{0}/http://example.com/orig-2',
            "2",
            "/20200101{0}/https://example.com/redirect-2",
        )
        self._assert_redirect_replay(
            fmod,
            '/revisits/20220101{0}/http://example.com/orig-2',
            "3",
            "/20220101{0}/https://example.com/redirect-3",
        )

    def test_orig(self, fmod):
        """The 2020 orig-1 capture is served with the headers of record 1."""
        self._assert_redirect_replay(
            fmod,
            '/revisits/20200101{0}/http://example.com/orig-1',
            "1",
            "/20200101{0}/https://example.com/redirect-1",
        )