mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
* revisit loading fix for revisit records with http headers: - if revisit record has http headers, always use those headers - otherwise, continue to use http headers from payload record - parse headers of http and payload records on initial lookup, to simplify loading - tests: add test for loading revisit records with different urls, different headers but same payload - fix for sul-dlss/was-pywb#64 * also bump version to 2.6.8
147 lines
5.2 KiB
Python
147 lines
5.2 KiB
Python
|
|
from io import BytesIO
|
|
import os
|
|
|
|
from warcio import WARCWriter, StatusAndHeaders
|
|
from pywb.manager.manager import main as wb_manager
|
|
|
|
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
|
|
|
|
|
|
# ============================================================================
|
|
class TestRevisits(CollsDirMixin, BaseConfigTest):
|
|
@classmethod
|
|
def setup_class(cls):
|
|
super(TestRevisits, cls).setup_class('config_test.yaml')
|
|
|
|
|
|
def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
|
|
http_headers = StatusAndHeaders(
|
|
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
|
|
)
|
|
|
|
return self.writer.create_revisit_record(
|
|
url,
|
|
digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
|
|
refers_to_uri=refers_to_uri,
|
|
refers_to_date=refers_to_date,
|
|
warc_headers_dict={"WARC-Date": date},
|
|
http_headers=http_headers,
|
|
)
|
|
|
|
|
|
def create_response_record(self, url, date, headers, payload):
|
|
http_headers = StatusAndHeaders(
|
|
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
|
|
)
|
|
|
|
return self.writer.create_warc_record(
|
|
url,
|
|
record_type="response",
|
|
http_headers=http_headers,
|
|
payload=BytesIO(payload),
|
|
warc_headers_dict={"WARC-Date": date},
|
|
length=len(payload),
|
|
)
|
|
|
|
def create(self):
|
|
payload = b"some\ntext"
|
|
|
|
# record 1
|
|
self.writer.write_record(
|
|
self.create_response_record(
|
|
"http://example.com/orig-1",
|
|
"2020-01-01T00:00:00Z",
|
|
[
|
|
("Content-Type", 'text/plain; charset="UTF-8"'),
|
|
("Location", "https://example.com/redirect-1"),
|
|
("Content-Length", str(len(payload))),
|
|
("Custom", "1"),
|
|
],
|
|
payload,
|
|
)
|
|
)
|
|
|
|
# record 2
|
|
self.writer.write_record(
|
|
self.create_response_record(
|
|
"http://example.com/orig-2",
|
|
"2020-01-01T00:00:00Z",
|
|
[
|
|
("Content-Type", 'text/plain; charset="UTF-8"'),
|
|
("Location", "https://example.com/redirect-2"),
|
|
("Content-Length", str(len(payload))),
|
|
("Custom", "2"),
|
|
],
|
|
payload,
|
|
)
|
|
)
|
|
|
|
# record 3
|
|
self.writer.write_record(
|
|
self.create_revisit_record(
|
|
"http://example.com/orig-2",
|
|
"2022-01-01T00:00:00Z",
|
|
[
|
|
("Content-Type", 'text/plain; charset="UTF-8"'),
|
|
("Location", "https://example.com/redirect-3"),
|
|
("Content-Length", str(len(payload))),
|
|
("Custom", "3"),
|
|
],
|
|
refers_to_uri="http://example.com/orig-1",
|
|
refers_to_date="2020-01-01T00:00:00Z",
|
|
)
|
|
)
|
|
|
|
# record 4
|
|
self.writer.write_record(
|
|
self.create_revisit_record(
|
|
"http://example.com/",
|
|
"2022-01-01T00:00:00Z",
|
|
[
|
|
("Content-Type", 'text/plain; charset="UTF-8"'),
|
|
("Location", "https://example.com/redirect-4"),
|
|
("Content-Length", str(len(payload))),
|
|
("Custom", "4"),
|
|
],
|
|
refers_to_uri="http://example.com/orig-2",
|
|
refers_to_date="2020-01-01T00:00:00Z",
|
|
)
|
|
)
|
|
|
|
def test_init(self):
|
|
filename = os.path.join(self.root_dir, 'redir.warc.gz')
|
|
with open(filename, 'wb') as fh:
|
|
self.writer = WARCWriter(fh, gzip=True)
|
|
self.create()
|
|
|
|
wb_manager(['init', 'revisits'])
|
|
|
|
wb_manager(['add', 'revisits', filename])
|
|
|
|
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'))
|
|
|
|
def test_different_url_revisit_orig_headers(self, fmod):
|
|
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
|
|
assert res.headers["Custom"] == "4"
|
|
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
|
|
assert res.text == 'some\ntext'
|
|
|
|
def test_different_url_revisit_and_response(self, fmod):
|
|
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
|
|
assert res.headers["Custom"] == "2"
|
|
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
|
|
assert res.text == 'some\ntext'
|
|
|
|
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
|
|
assert res.headers["Custom"] == "3"
|
|
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
|
|
assert res.text == 'some\ntext'
|
|
|
|
def test_orig(self, fmod):
|
|
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
|
|
assert res.headers["Custom"] == "1"
|
|
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-1".format(fmod))
|
|
assert res.text == 'some\ntext'
|
|
|