mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
revisit of redirect optimization: (#753)
- if a revisit is of a redirect (3xx response) and the revisit has http headers, return the http headers with an empty payload -- don't bother loading the original record - builds on changes in #751 - cleanup redirect revisit tests from #751
This commit is contained in:
parent
0cc912da95
commit
c121198183
@ -75,6 +75,14 @@ class ResolvingLoader(object):
|
|||||||
# two index lookups
|
# two index lookups
|
||||||
# Case 1: if mimetype is still warc/revisit
|
# Case 1: if mimetype is still warc/revisit
|
||||||
if cdx.get('mime') == 'warc/revisit' and headers_record:
|
if cdx.get('mime') == 'warc/revisit' and headers_record:
|
||||||
|
if headers_record.http_headers:
|
||||||
|
status = headers_record.http_headers.get_statuscode()
|
||||||
|
# optimization: if redirect, don't load payload record, as it'll be ignored by browser
|
||||||
|
# always replay zero-length payload
|
||||||
|
if status and status.startswith('3'):
|
||||||
|
headers_record.http_headers.replace_header('Content-Length', '0')
|
||||||
|
return headers_record, headers_record
|
||||||
|
|
||||||
payload_record = self._load_different_url_payload(cdx,
|
payload_record = self._load_different_url_payload(cdx,
|
||||||
headers_record,
|
headers_record,
|
||||||
failed_files,
|
failed_files,
|
||||||
|
@ -104,13 +104,11 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
def test_replay_content_head_non_zero_content_length_match(self):
|
def test_replay_content_head_non_zero_content_length_match(self):
|
||||||
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||||
length = resp.content_length
|
length = resp.content_length
|
||||||
print('length', length)
|
|
||||||
|
|
||||||
# Content-Length included if non-zero
|
# Content-Length included if non-zero
|
||||||
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
|
||||||
|
|
||||||
#assert resp.headers['Content-Length'] == length
|
#assert resp.headers['Content-Length'] == length
|
||||||
print('length', resp.content_length)
|
|
||||||
assert resp.content_length == length
|
assert resp.content_length == length
|
||||||
|
|
||||||
def test_replay_content(self, fmod):
|
def test_replay_content(self, fmod):
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import os
|
import os
|
||||||
@ -5,8 +6,6 @@ import os
|
|||||||
from warcio import WARCWriter, StatusAndHeaders
|
from warcio import WARCWriter, StatusAndHeaders
|
||||||
from pywb.manager.manager import main as wb_manager
|
from pywb.manager.manager import main as wb_manager
|
||||||
|
|
||||||
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TestRevisits(CollsDirMixin, BaseConfigTest):
|
class TestRevisits(CollsDirMixin, BaseConfigTest):
|
||||||
@ -125,18 +124,22 @@ class TestRevisits(CollsDirMixin, BaseConfigTest):
|
|||||||
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
|
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
|
||||||
assert res.headers["Custom"] == "4"
|
assert res.headers["Custom"] == "4"
|
||||||
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
|
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
|
||||||
assert res.text == 'some\ntext'
|
assert res.content_length == 0
|
||||||
|
assert res.text == ''
|
||||||
|
|
||||||
def test_different_url_revisit_and_response(self, fmod):
|
def test_different_url_response_and_revisit(self, fmod):
|
||||||
|
# response
|
||||||
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
|
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
|
||||||
assert res.headers["Custom"] == "2"
|
assert res.headers["Custom"] == "2"
|
||||||
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
|
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
|
||||||
assert res.text == 'some\ntext'
|
assert res.text == 'some\ntext'
|
||||||
|
|
||||||
|
# revisit
|
||||||
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
|
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
|
||||||
assert res.headers["Custom"] == "3"
|
assert res.headers["Custom"] == "3"
|
||||||
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
|
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
|
||||||
assert res.text == 'some\ntext'
|
assert res.content_length == 0
|
||||||
|
assert res.text == ''
|
||||||
|
|
||||||
def test_orig(self, fmod):
|
def test_orig(self, fmod):
|
||||||
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
|
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user