1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

revisit of redirect optimization: (#753)

- if a revisit is of a redirect (3xx response) and the revisit record has HTTP headers, return
those HTTP headers with an empty payload -- don't bother loading the original record
(builds on changes in #751)
- clean up redirect revisit tests from #751
This commit is contained in:
Ilya Kreymer 2022-08-20 13:53:16 -07:00 committed by GitHub
parent 0cc912da95
commit c121198183
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 16 additions and 7 deletions

View File

@ -75,6 +75,14 @@ class ResolvingLoader(object):
# two index lookups # two index lookups
# Case 1: if mimetype is still warc/revisit # Case 1: if mimetype is still warc/revisit
if cdx.get('mime') == 'warc/revisit' and headers_record: if cdx.get('mime') == 'warc/revisit' and headers_record:
if headers_record.http_headers:
status = headers_record.http_headers.get_statuscode()
# optimization: if redirect, don't load payload record, as it'll be ignored by browser
# always replay zero-length payload
if status and status.startswith('3'):
headers_record.http_headers.replace_header('Content-Length', '0')
return headers_record, headers_record
payload_record = self._load_different_url_payload(cdx, payload_record = self._load_different_url_payload(cdx,
headers_record, headers_record,
failed_files, failed_files,

View File

@ -104,13 +104,11 @@ class TestWbIntegration(BaseConfigTest):
def test_replay_content_head_non_zero_content_length_match(self): def test_replay_content_head_non_zero_content_length_match(self):
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
length = resp.content_length length = resp.content_length
print('length', length)
# Content-Length included if non-zero # Content-Length included if non-zero
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200) resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
#assert resp.headers['Content-Length'] == length #assert resp.headers['Content-Length'] == length
print('length', resp.content_length)
assert resp.content_length == length assert resp.content_length == length
def test_replay_content(self, fmod): def test_replay_content(self, fmod):

View File

@ -1,3 +1,4 @@
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
from io import BytesIO from io import BytesIO
import os import os
@ -5,8 +6,6 @@ import os
from warcio import WARCWriter, StatusAndHeaders from warcio import WARCWriter, StatusAndHeaders
from pywb.manager.manager import main as wb_manager from pywb.manager.manager import main as wb_manager
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
# ============================================================================ # ============================================================================
class TestRevisits(CollsDirMixin, BaseConfigTest): class TestRevisits(CollsDirMixin, BaseConfigTest):
@ -125,18 +124,22 @@ class TestRevisits(CollsDirMixin, BaseConfigTest):
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301) res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
assert res.headers["Custom"] == "4" assert res.headers["Custom"] == "4"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod)) assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
assert res.text == 'some\ntext' assert res.content_length == 0
assert res.text == ''
def test_different_url_revisit_and_response(self, fmod): def test_different_url_response_and_revisit(self, fmod):
# response
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301) res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "2" assert res.headers["Custom"] == "2"
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod)) assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
assert res.text == 'some\ntext' assert res.text == 'some\ntext'
# revisit
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301) res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
assert res.headers["Custom"] == "3" assert res.headers["Custom"] == "3"
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod)) assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
assert res.text == 'some\ntext' assert res.content_length == 0
assert res.text == ''
def test_orig(self, fmod): def test_orig(self, fmod):
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301) res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)