From ac5b4da9eb790ec61eddedebb65acd8540ab02ab Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 14 Jun 2018 10:48:32 -0400 Subject: [PATCH] Self-Redirect Fix (#345) * self-redirect fix for multiple continuous 3xx responses: if after one self-redirect, next match is also a redirect where url canonicalizes to same as previously rejected, also treat as self-redirect tests: add new test_self_redirect for generating example pattern where self-redirect could occur * self-redirect: ensure warc records are closed when handling self-redirect exception! --- pywb/warcserver/resource/responseloader.py | 25 ++++- tests/test_self_redirect.py | 108 +++++++++++++++++++++ 2 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 tests/test_self_redirect.py diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index ea64e081..e6470176 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -7,6 +7,8 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser from pywb.utils.wbexception import LiveResourceException, WbException +from pywb.utils.canonicalize import canonicalize + from pywb.utils.memento import MementoUtils from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn from pywb.utils.format import ParamFormatter @@ -131,6 +133,7 @@ class BaseLoader(object): if not location_url: return + location_url = location_url.lower() if location_url.startswith('/'): host = urlsplit(cdx['url']).netloc @@ -139,9 +142,19 @@ class BaseLoader(object): location_url = location_url.split('://', 1)[-1].rstrip('/') request_url = request_url.split('://', 1)[-1].rstrip('/') + self_redir = False + if request_url == location_url: + self_redir = True + elif params.get('sr-urlkey'): + # if new location canonicalized matches old key, also self-redirect + if canonicalize(location_url) == params.get('sr-urlkey'): + self_redir = True + + if self_redir: msg = 'Self Redirect {0} -> {1}' msg = msg.format(request_url, location_url) + params['sr-urlkey'] = cdx['urlkey'] raise LiveResourceException(msg) @staticmethod @@ -198,9 +211,15 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): # status may not be set for 'revisit' if not status or status.startswith('3'): http_headers = self.headers_parser.parse(payload.raw_stream) - self.raise_on_self_redirect(params, cdx, - http_headers.get_statuscode(), - http_headers.get_header('Location')) + + try: + self.raise_on_self_redirect(params, cdx, + http_headers.get_statuscode(), + http_headers.get_header('Location')) + except LiveResourceException: + headers.raw_stream.close() + payload.raw_stream.close() + raise http_headers_buff = http_headers.to_bytes() diff --git a/tests/test_self_redirect.py b/tests/test_self_redirect.py new file mode 100644 index 00000000..d49ec2dd --- /dev/null +++ b/tests/test_self_redirect.py @@ -0,0 +1,108 @@ +from .base_config_test import BaseConfigTest, CollsDirMixin, fmod + +from warcio.timeutils import timestamp_to_iso_date +from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders +from io import BytesIO +import os + +from pywb.manager.manager import main as wb_manager + + +# ============================================================================ +class TestSelfRedirect(CollsDirMixin, BaseConfigTest): + @classmethod + def setup_class(cls): + super(TestSelfRedirect, cls).setup_class('config_test.yaml') + + def create_redirect_record(self, url, redirect_url, timestamp): + warc_headers = {} + warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp) + + #content = 'Redirect to ' + redirect_url + content = '' + payload = content.encode('utf-8') + headers_list = [('Content-Length', str(len(payload))), + ('Location', redirect_url) + ] + + http_headers = StatusAndHeaders('301 Permanent Redirect', headers_list, protocol='HTTP/1.0') + + rec = self.writer.create_warc_record(url, 'response', + payload=BytesIO(payload), + length=len(payload), + http_headers=http_headers, + warc_headers_dict=warc_headers) + + self.writer.write_record(rec) + + return rec + + def create_response_record(self, url, timestamp, text): + payload = text.encode('utf-8') + + warc_headers = {} + warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp) + + headers_list = [('Content-Length', str(len(payload)))] + + http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0') + + rec = self.writer.create_warc_record(url, 'response', + payload=BytesIO(payload), + length=len(payload), + http_headers=http_headers, + warc_headers_dict=warc_headers) + + self.writer.write_record(rec) + return rec + + def create_revisit_record(self, original, url, redirect_url, timestamp): + warc_headers = {} + warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp) + + headers_list = [('Content-Length', '0'), + ('Location', redirect_url)] + + http_headers = StatusAndHeaders('302 Temp Redirect', headers_list, protocol='HTTP/1.0') + + rec = self.writer.create_revisit_record(url, + digest=original.rec_headers['WARC-Payload-Digest'], + refers_to_uri=url, + refers_to_date=original.rec_headers['WARC-Date'], + warc_headers_dict=warc_headers, + http_headers=http_headers) + + self.writer.write_record(rec) + + def init_warc(self, filename, coll): + filename = os.path.join(self.root_dir, filename) + with open(filename, 'wb') as fh: + self.writer = WARCWriter(fh, gzip=True) + + redirect = self.create_redirect_record('http://example.com/', 'https://example.com/', '201806026101112') + redirect = self.create_redirect_record('https://example.com/', 'https://www.example.com/', '201806026101112') + response = self.create_response_record('https://www.example.com/', '201806026101112', 'Some Text') + + wb_manager(['init', coll]) + + wb_manager(['add', coll, filename]) + + def test_self_redir_init(self): + self.init_warc('redirect.warc.gz', 'redir') + + assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj')) + + + def test_self_redir_1(self, fmod): + res = self.get('/redir/201806026101112{0}/https://example.com/', fmod) + + assert res.status_code == 200 + + assert res.text == 'Some Text' + + + + + +