mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Self-Redirect Fix (#345)
* self-redirect fix for multiple consecutive 3xx responses: if, after one self-redirect, the next match is also a redirect whose url canonicalizes to the same key as the one previously rejected, also treat it as a self-redirect. tests: add new test_self_redirect, which generates an example pattern where a self-redirect could occur
* self-redirect: ensure warc records are closed when handling the self-redirect exception!
This commit is contained in:
parent
a3476d8baa
commit
ac5b4da9eb
@ -7,6 +7,8 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
|||||||
|
|
||||||
from pywb.utils.wbexception import LiveResourceException, WbException
|
from pywb.utils.wbexception import LiveResourceException, WbException
|
||||||
|
|
||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
from pywb.utils.memento import MementoUtils
|
from pywb.utils.memento import MementoUtils
|
||||||
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
|
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
|
||||||
from pywb.utils.format import ParamFormatter
|
from pywb.utils.format import ParamFormatter
|
||||||
@ -131,6 +133,7 @@ class BaseLoader(object):
|
|||||||
if not location_url:
|
if not location_url:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
location_url = location_url.lower()
|
location_url = location_url.lower()
|
||||||
if location_url.startswith('/'):
|
if location_url.startswith('/'):
|
||||||
host = urlsplit(cdx['url']).netloc
|
host = urlsplit(cdx['url']).netloc
|
||||||
@ -139,9 +142,19 @@ class BaseLoader(object):
|
|||||||
location_url = location_url.split('://', 1)[-1].rstrip('/')
|
location_url = location_url.split('://', 1)[-1].rstrip('/')
|
||||||
request_url = request_url.split('://', 1)[-1].rstrip('/')
|
request_url = request_url.split('://', 1)[-1].rstrip('/')
|
||||||
|
|
||||||
|
self_redir = False
|
||||||
|
|
||||||
if request_url == location_url:
|
if request_url == location_url:
|
||||||
|
self_redir = True
|
||||||
|
elif params.get('sr-urlkey'):
|
||||||
|
# if new location canonicalized matches old key, also self-redirect
|
||||||
|
if canonicalize(location_url) == params.get('sr-urlkey'):
|
||||||
|
self_redir = True
|
||||||
|
|
||||||
|
if self_redir:
|
||||||
msg = 'Self Redirect {0} -> {1}'
|
msg = 'Self Redirect {0} -> {1}'
|
||||||
msg = msg.format(request_url, location_url)
|
msg = msg.format(request_url, location_url)
|
||||||
|
params['sr-urlkey'] = cdx['urlkey']
|
||||||
raise LiveResourceException(msg)
|
raise LiveResourceException(msg)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -198,9 +211,15 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
|||||||
# status may not be set for 'revisit'
|
# status may not be set for 'revisit'
|
||||||
if not status or status.startswith('3'):
|
if not status or status.startswith('3'):
|
||||||
http_headers = self.headers_parser.parse(payload.raw_stream)
|
http_headers = self.headers_parser.parse(payload.raw_stream)
|
||||||
self.raise_on_self_redirect(params, cdx,
|
|
||||||
http_headers.get_statuscode(),
|
try:
|
||||||
http_headers.get_header('Location'))
|
self.raise_on_self_redirect(params, cdx,
|
||||||
|
http_headers.get_statuscode(),
|
||||||
|
http_headers.get_header('Location'))
|
||||||
|
except LiveResourceException:
|
||||||
|
headers.raw_stream.close()
|
||||||
|
payload.raw_stream.close()
|
||||||
|
raise
|
||||||
|
|
||||||
http_headers_buff = http_headers.to_bytes()
|
http_headers_buff = http_headers.to_bytes()
|
||||||
|
|
||||||
|
108
tests/test_self_redirect.py
Normal file
108
tests/test_self_redirect.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
|
||||||
|
|
||||||
|
from warcio.timeutils import timestamp_to_iso_date
|
||||||
|
from warcio.warcwriter import WARCWriter
|
||||||
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
from io import BytesIO
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pywb.manager.manager import main as wb_manager
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class TestSelfRedirect(CollsDirMixin, BaseConfigTest):
    """Replay test for a redirect chain that could be mistaken for a self-redirect.

    A WARC is generated containing
    ``http://example.com/ -> https://example.com/ -> https://www.example.com/``
    with every record carrying the same timestamp, it is indexed into a new
    collection, and the final 200 response is then expected when replaying
    ``https://example.com/``.

    The tests depend on definition order: ``test_self_redir_init`` creates
    the collection that ``test_self_redir_1`` replays from.
    """

    @classmethod
    def setup_class(cls):
        """Initialise the test app from ``config_test.yaml``."""
        super(TestSelfRedirect, cls).setup_class('config_test.yaml')

    def create_redirect_record(self, url, redirect_url, timestamp):
        """Write a ``301`` response record for *url* pointing at *redirect_url*.

        :param url: target URI of the record
        :param redirect_url: value of the ``Location`` header
        :param timestamp: timestamp string converted into ``WARC-Date``
        :return: the record that was written to ``self.writer``
        """
        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        # The redirect carries no body, only the Location header.
        payload = b''

        headers_list = [
            ('Content-Length', str(len(payload))),
            ('Location', redirect_url),
        ]

        http_headers = StatusAndHeaders('301 Permanent Redirect', headers_list,
                                        protocol='HTTP/1.0')

        rec = self.writer.create_warc_record(url, 'response',
                                             payload=BytesIO(payload),
                                             length=len(payload),
                                             http_headers=http_headers,
                                             warc_headers_dict=warc_headers)

        self.writer.write_record(rec)
        return rec

    def create_response_record(self, url, timestamp, text):
        """Write a ``200 OK`` response record for *url* with *text* as its body.

        :param url: target URI of the record
        :param timestamp: timestamp string converted into ``WARC-Date``
        :param text: body text, encoded as UTF-8
        :return: the record that was written to ``self.writer``
        """
        payload = text.encode('utf-8')

        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        headers_list = [('Content-Length', str(len(payload)))]

        http_headers = StatusAndHeaders('200 OK', headers_list,
                                        protocol='HTTP/1.0')

        rec = self.writer.create_warc_record(url, 'response',
                                             payload=BytesIO(payload),
                                             length=len(payload),
                                             http_headers=http_headers,
                                             warc_headers_dict=warc_headers)

        self.writer.write_record(rec)
        return rec

    def create_revisit_record(self, original, url, redirect_url, timestamp):
        """Write a ``302`` revisit record for *url* that refers back to *original*.

        The payload digest and the refers-to date are copied from the headers
        of *original*; the refers-to URI is *url* itself.

        :param original: previously created record supplying
            ``WARC-Payload-Digest`` and ``WARC-Date``
        :param url: target URI of the revisit record
        :param redirect_url: value of the ``Location`` header
        :param timestamp: timestamp string converted into ``WARC-Date``
        :return: the record that was written to ``self.writer``
        """
        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        headers_list = [
            ('Content-Length', '0'),
            ('Location', redirect_url),
        ]

        http_headers = StatusAndHeaders('302 Temp Redirect', headers_list,
                                        protocol='HTTP/1.0')

        rec = self.writer.create_revisit_record(
            url,
            digest=original.rec_headers['WARC-Payload-Digest'],
            refers_to_uri=url,
            refers_to_date=original.rec_headers['WARC-Date'],
            warc_headers_dict=warc_headers,
            http_headers=http_headers)

        self.writer.write_record(rec)
        # Returned for consistency with the other record helpers.
        return rec

    def init_warc(self, filename, coll):
        """Create the redirect-chain WARC and index it into collection *coll*.

        :param filename: WARC file name, created under ``self.root_dir``
        :param coll: name of the collection to initialise and add the WARC to
        """
        filename = os.path.join(self.root_dir, filename)

        # NOTE(review): the timestamp below has 15 digits, presumably a typo
        # for a 14-digit value. It is left unchanged because the replay URL
        # in test_self_redir_1 uses the same string -- confirm before fixing.
        with open(filename, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)

            self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112')
            self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112')
            self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text')

        # The file is closed before indexing so the manager reads the
        # complete WARC.
        wb_manager(['init', coll])

        wb_manager(['add', coll, filename])

    def test_self_redir_init(self):
        """Build the ``redir`` collection and check that its index exists."""
        self.init_warc('redirect.warc.gz', 'redir')

        assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj'))

    def test_self_redir_1(self, fmod):
        """Replaying ``https://example.com/`` must reach the final 200 response."""
        res = self.get('/redir/201806026101112{0}/https://example.com/', fmod)

        assert res.status_code == 200

        assert res.text == 'Some Text'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user