1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Misc Fixes for RC5 (#534)

* misc fixes (rc 5):
- banner: only auto-init the banner if not in the top frame (check that no-frame mode is active and the replay url is set)
- index: 'cdx+' fix for use as internal index: if cdx has a warc filename and offset, don't attempt default live web load
- improved self-redirect detection: avoid the www2 -> www redirect altogether, not just on the second redirect
- tests: update tests for improved self-redirect checking
- bump version to pywb-2.4.0-rc5
This commit is contained in:
Ilya Kreymer 2020-01-17 17:38:08 -08:00 committed by GitHub
parent 93ce4f6f7a
commit fa021eebab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 38 additions and 12 deletions

View File

@ -306,8 +306,8 @@ This file is part of pywb, https://github.com/webrecorder/pywb
// all banners will expose themselves by adding themselves as WBBanner on window // all banners will expose themselves by adding themselves as WBBanner on window
window.WBBanner = new DefaultBanner(); window.WBBanner = new DefaultBanner();
// if in replay frame, init immediately // if wbinfo.url is set and not-framed, init banner in content frame
if (window.wbinfo) { if (window.wbinfo && window.wbinfo.url && !window.wbinfo.is_framed) {
if (document.readyState === "loading") { if (document.readyState === "loading") {
document.addEventListener("DOMContentLoaded", function() { document.addEventListener("DOMContentLoaded", function() {
window.WBBanner.init(); window.WBBanner.init();

View File

@ -1,4 +1,4 @@
__version__ = '2.4.0rc4' __version__ = '2.4.0-rc5'
if __name__ == '__main__': if __name__ == '__main__':
print(__version__) print(__version__)

View File

@ -139,18 +139,19 @@ class BaseLoader(object):
request_url = request_url.split('://', 1)[-1].rstrip('/') request_url = request_url.split('://', 1)[-1].rstrip('/')
self_redir = False self_redir = False
orig_key = params.get('sr-urlkey') or cdx['urlkey']
if request_url == location_url: if request_url == location_url:
self_redir = True self_redir = True
elif params.get('sr-urlkey'):
# if new location canonicalized matches old key, also self-redirect # if new location canonicalized matches old key, also self-redirect
if canonicalize(location_url) == params.get('sr-urlkey'): elif canonicalize(location_url) == orig_key:
self_redir = True self_redir = True
if self_redir: if self_redir:
msg = 'Self Redirect {0} -> {1}' msg = 'Self Redirect {0} -> {1}'
msg = msg.format(request_url, location_url) msg = msg.format(request_url, location_url)
params['sr-urlkey'] = cdx['urlkey'] params['sr-urlkey'] = orig_key
raise LiveResourceException(msg) raise LiveResourceException(msg)
@staticmethod @staticmethod
@ -267,6 +268,9 @@ class LiveWebLoader(BaseLoader):
self.socks_proxy = None self.socks_proxy = None
def load_resource(self, cdx, params): def load_resource(self, cdx, params):
if cdx.get('filename') and cdx.get('offset') is not None:
return None
load_url = cdx.get('load_url') load_url = cdx.get('load_url')
if not load_url: if not load_url:
return None return None

View File

@ -220,8 +220,8 @@ class TestBaseWarcServer(HttpBinLiveTests, MementoOverrideTests, FakeRedisTests,
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.http_headers) print(record.http_headers)
assert record.http_headers.get_statuscode() == '302' assert record.http_headers.get_statuscode() == '200'
assert record.http_headers.get_header('Location') == 'https://www.iana.org/' #assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live')) @patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self): def test_agg_select_live(self):

View File

@ -15,7 +15,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
def setup_class(cls): def setup_class(cls):
super(TestRedirects, cls).setup_class('config_test.yaml') super(TestRedirects, cls).setup_class('config_test.yaml')
def create_redirect_record(self, url, redirect_url, timestamp): def create_redirect_record(self, url, redirect_url, timestamp, status='301'):
warc_headers = {} warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp) warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)
@ -26,7 +26,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
('Location', redirect_url) ('Location', redirect_url)
] ]
http_headers = StatusAndHeaders('301 Permanent Redirect', headers_list, protocol='HTTP/1.0') http_headers = StatusAndHeaders(status + ' Redirect', headers_list, protocol='HTTP/1.0')
rec = self.writer.create_warc_record(url, 'response', rec = self.writer.create_warc_record(url, 'response',
payload=BytesIO(payload), payload=BytesIO(payload),
@ -140,4 +140,26 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200) res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200)
assert res.text == 'Some Text' assert res.text == 'Some Text'
def test_init_2(self):
filename = os.path.join(self.root_dir, 'redir2.warc.gz')
with open(filename, 'wb') as fh:
self.writer = WARCWriter(fh, gzip=True)
redirect = self.create_redirect_record('http://www.example.com/path', 'https://www.example.com/path/', '20191003115920')
redirect = self.create_redirect_record('https://www.example.com/path/', 'https://www2.example.com/path', '20191003115927', status='302')
response = self.create_response_record('https://www2.example.com/path', '20191024125646', 'Some Text')
revisit = self.create_revisit_record('https://www2.example.com/path', '20191024125648', 'https://www2.example.com/path', response.rec_headers['WARC-Date'])
wb_manager(['init', 'redir2'])
wb_manager(['add', 'redir2', filename])
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'redir2', 'indexes', 'index.cdxj'))
def test_revisit_redirect_skip_self_redir_2(self, fmod):
res = self.get('/redir2/20191024125648{0}/http://www2.example.com/path', fmod, status=200)
assert res.text == 'Some Text'
res = self.get('/redir2/20191024125648{0}/https://www.example.com/path', fmod, status=200)
assert res.text == 'Some Text'