1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Self-Redirect Fix (#345)

* self-redirect fix for multiple continuous 3xx responses: if after one self-redirect, next match is also a redirect where url canonicalizes to same as previously rejected, also treat as self-redirect
tests: add new test_self_redirect for generating example pattern where self-redirect could occur

* self-redirect: ensure warc records are closed when handling self-redirect exception!
This commit is contained in:
Ilya Kreymer 2018-06-14 10:48:32 -04:00 committed by GitHub
parent a3476d8baa
commit ac5b4da9eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 130 additions and 3 deletions

View File

@ -7,6 +7,8 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
from pywb.utils.format import ParamFormatter from pywb.utils.format import ParamFormatter
@ -131,6 +133,7 @@ class BaseLoader(object):
if not location_url: if not location_url:
return return
location_url = location_url.lower() location_url = location_url.lower()
if location_url.startswith('/'): if location_url.startswith('/'):
host = urlsplit(cdx['url']).netloc host = urlsplit(cdx['url']).netloc
@ -139,9 +142,19 @@ class BaseLoader(object):
location_url = location_url.split('://', 1)[-1].rstrip('/') location_url = location_url.split('://', 1)[-1].rstrip('/')
request_url = request_url.split('://', 1)[-1].rstrip('/') request_url = request_url.split('://', 1)[-1].rstrip('/')
self_redir = False
if request_url == location_url: if request_url == location_url:
self_redir = True
elif params.get('sr-urlkey'):
# if new location canonicalized matches old key, also self-redirect
if canonicalize(location_url) == params.get('sr-urlkey'):
self_redir = True
if self_redir:
msg = 'Self Redirect {0} -> {1}' msg = 'Self Redirect {0} -> {1}'
msg = msg.format(request_url, location_url) msg = msg.format(request_url, location_url)
params['sr-urlkey'] = cdx['urlkey']
raise LiveResourceException(msg) raise LiveResourceException(msg)
@staticmethod @staticmethod
@ -198,9 +211,15 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
# status may not be set for 'revisit' # status may not be set for 'revisit'
if not status or status.startswith('3'): if not status or status.startswith('3'):
http_headers = self.headers_parser.parse(payload.raw_stream) http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(), try:
http_headers.get_header('Location')) self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(),
http_headers.get_header('Location'))
except LiveResourceException:
headers.raw_stream.close()
payload.raw_stream.close()
raise
http_headers_buff = http_headers.to_bytes() http_headers_buff = http_headers.to_bytes()

108
tests/test_self_redirect.py Normal file
View File

@ -0,0 +1,108 @@
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
from warcio.timeutils import timestamp_to_iso_date
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
from io import BytesIO
import os
from pywb.manager.manager import main as wb_manager
# ============================================================================
class TestSelfRedirect(CollsDirMixin, BaseConfigTest):
@classmethod
def setup_class(cls):
super(TestSelfRedirect, cls).setup_class('config_test.yaml')
def create_redirect_record(self, url, redirect_url, timestamp):
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)
#content = 'Redirect to ' + redirect_url
content = ''
payload = content.encode('utf-8')
headers_list = [('Content-Length', str(len(payload))),
('Location', redirect_url)
]
http_headers = StatusAndHeaders('301 Permanent Redirect', headers_list, protocol='HTTP/1.0')
rec = self.writer.create_warc_record(url, 'response',
payload=BytesIO(payload),
length=len(payload),
http_headers=http_headers,
warc_headers_dict=warc_headers)
self.writer.write_record(rec)
return rec
def create_response_record(self, url, timestamp, text):
payload = text.encode('utf-8')
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)
headers_list = [('Content-Length', str(len(payload)))]
http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
rec = self.writer.create_warc_record(url, 'response',
payload=BytesIO(payload),
length=len(payload),
http_headers=http_headers,
warc_headers_dict=warc_headers)
self.writer.write_record(rec)
return rec
def create_revisit_record(self, original, url, redirect_url, timestamp):
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)
headers_list = [('Content-Length', '0'),
('Location', redirect_url)]
http_headers = StatusAndHeaders('302 Temp Redirect', headers_list, protocol='HTTP/1.0')
rec = self.writer.create_revisit_record(url,
digest=original.rec_headers['WARC-Payload-Digest'],
refers_to_uri=url,
refers_to_date=original.rec_headers['WARC-Date'],
warc_headers_dict=warc_headers,
http_headers=http_headers)
self.writer.write_record(rec)
def init_warc(self, filename, coll):
filename = os.path.join(self.root_dir, filename)
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)
redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112')
redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112')
response = self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text')
wb_manager(['init', coll])
wb_manager(['add', coll, filename])
def test_self_redir_init(self):
self.init_warc('redirect.warc.gz', 'redir')
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj'))
def test_self_redir_1(self, fmod):
res = self.get('/redir/201806026101112{0}/https://example.com/', fmod)
assert res.status_code == 200
assert res.text == 'Some Text'