1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Trailing slash fix: if the requested URL has no path after the hostname, redirect to the same URL with a trailing slash added, preserving the query string (#211)

This commit is contained in:
Ilya Kreymer 2017-04-04 18:10:49 -07:00
parent 7ca5795976
commit f593b5f80f
3 changed files with 36 additions and 19 deletions

View File

@ -23,7 +23,7 @@ from pywb.framework.wbrequestresponse import WbResponse
from pywb.webagg.utils import MementoUtils, buffer_iter from pywb.webagg.utils import MementoUtils, buffer_iter
from werkzeug.http import HTTP_STATUS_CODES from werkzeug.http import HTTP_STATUS_CODES
from six.moves.urllib.parse import urlencode from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest from pywb.urlrewrite.rewriteinputreq import RewriteInputRequest
from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView from pywb.urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
@ -125,9 +125,13 @@ class RewriterApp(object):
full_prefix=full_prefix, full_prefix=full_prefix,
rel_prefix=rel_prefix) rel_prefix=rel_prefix)
scheme_inx = wb_url.url.find('//') url_parts = urlsplit(wb_url.url)
if wb_url.url.find('/', scheme_inx + 2) < 0: if not url_parts.path:
return WbResponse.redir_response(urlrewriter.rewrite(wb_url.url + '/')) scheme, netloc, path, query, frag = url_parts
path = '/'
url = urlunsplit((scheme, netloc, path, query, frag))
return WbResponse.redir_response(urlrewriter.rewrite(url),
'307 Temporary Redirect')
self.unrewrite_referrer(environ) self.unrewrite_referrer(environ)
@ -211,7 +215,8 @@ class RewriterApp(object):
cdx['url'] = target_uri cdx['url'] = target_uri
if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1': if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
return WbResponse.redir_response(urlrewriter.rewrite(target_uri)) return WbResponse.redir_response(urlrewriter.rewrite(target_uri),
'307 Temporary Redirect')
self._add_custom_params(cdx, r.headers, kwargs) self._add_custom_params(cdx, r.headers, kwargs)

View File

@ -106,7 +106,7 @@ class TestWbIntegration(BaseConfigTest):
def test_replay_fuzzy_1(self): def test_replay_fuzzy_1(self):
resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123') resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123')
assert resp.status_int == 302 assert resp.status_int == 307
assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/') assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/')
def test_replay_no_fuzzy_match(self): def test_replay_no_fuzzy_match(self):
@ -121,8 +121,18 @@ class TestWbIntegration(BaseConfigTest):
# assert 'wb.js' in resp.text # assert 'wb.js' in resp.text
# assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.text # assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.text
def test_no_slash_redir_1(self):
resp = self.testapp.get('/pywb/20140103030321mp_/http://example.com')
assert resp.status_int == 307
assert resp.headers['Location'].endswith('/pywb/20140103030321mp_/http://example.com/')
def test_no_slash_redir_2(self):
resp = self.testapp.get('/pywb/20140103030321mp_/http://example.com?example=1')
assert resp.status_int == 307
assert resp.headers['Location'].endswith('/pywb/20140103030321mp_/http://example.com/?example=1')
def test_replay_cdxj(self): def test_replay_cdxj(self):
resp = self.testapp.get('/pywb-cdxj/20140103030321mp_/http://example.com?example=1') resp = self.testapp.get('/pywb-cdxj/20140103030321mp_/http://example.com/?example=1')
self._assert_basic_html(resp) self._assert_basic_html(resp)
assert '"20140103030321"' in resp.text assert '"20140103030321"' in resp.text
@ -130,7 +140,7 @@ class TestWbIntegration(BaseConfigTest):
assert '/pywb-cdxj/20140103030321mp_/http://www.iana.org/domains/example' in resp.text assert '/pywb-cdxj/20140103030321mp_/http://www.iana.org/domains/example' in resp.text
def test_replay_cdxj_revisit(self): def test_replay_cdxj_revisit(self):
resp = self.testapp.get('/pywb-cdxj/20140103030341mp_/http://example.com?example=1') resp = self.testapp.get('/pywb-cdxj/20140103030341mp_/http://example.com/?example=1')
self._assert_basic_html(resp) self._assert_basic_html(resp)
assert '"20140103030341"' in resp.text assert '"20140103030341"' in resp.text
@ -138,7 +148,7 @@ class TestWbIntegration(BaseConfigTest):
assert '/pywb-cdxj/20140103030341mp_/http://www.iana.org/domains/example' in resp.text assert '/pywb-cdxj/20140103030341mp_/http://www.iana.org/domains/example' in resp.text
def test_zero_len_revisit(self): def test_zero_len_revisit(self):
resp = self.testapp.get('/pywb/20140603030341mp_/http://example.com?example=2') resp = self.testapp.get('/pywb/20140603030341mp_/http://example.com/?example=2')
self._assert_basic_html(resp) self._assert_basic_html(resp)
assert '"20140603030341"' in resp.text assert '"20140603030341"' in resp.text
@ -181,7 +191,7 @@ class TestWbIntegration(BaseConfigTest):
assert '"/_css/2013.1/screen.css"' in resp.text assert '"/_css/2013.1/screen.css"' in resp.text
def test_replay_identity_1(self): def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') resp = self.testapp.get('/pywb/20140127171251id_/http://example.com/')
# no wb header insertion # no wb header insertion
assert 'wb.js' not in resp.text assert 'wb.js' not in resp.text
@ -235,7 +245,7 @@ class TestWbIntegration(BaseConfigTest):
assert resp.content_length == 0 assert resp.content_length == 0
def test_replay_identity_2_arcgz(self): def test_replay_identity_2_arcgz(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com') resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com/')
# no wb header insertion # no wb header insertion
assert 'wb.js' not in resp.text assert 'wb.js' not in resp.text
@ -244,7 +254,7 @@ class TestWbIntegration(BaseConfigTest):
assert '"http://www.iana.org/domains/example"' in resp.text assert '"http://www.iana.org/domains/example"' in resp.text
def test_replay_identity_2_arc(self): def test_replay_identity_2_arc(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com') resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com/')
# no wb header insertion # no wb header insertion
assert 'wb.js' not in resp.text assert 'wb.js' not in resp.text
@ -350,21 +360,21 @@ class TestWbIntegration(BaseConfigTest):
# assert resp.status_int == 302 # assert resp.status_int == 302
def test_not_existant_warc_other_capture(self): def test_not_existant_warc_other_capture(self):
resp = self.testapp.get('/pywb/20140703030321mp_/http://example.com?example=2') resp = self.testapp.get('/pywb/20140703030321mp_/http://example.com/?example=2')
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341mp_/http://example.com?example=2') assert resp.headers['Content-Location'].endswith('/pywb/20140603030341mp_/http://example.com?example=2')
def test_missing_revisit_other_capture(self): def test_missing_revisit_other_capture(self):
resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com?example=2') resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com/?example=2')
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341mp_/http://example.com?example=2') assert resp.headers['Content-Location'].endswith('/pywb/20140603030341mp_/http://example.com?example=2')
def test_not_existant_warc_no_other(self): def test_not_existant_warc_no_other(self):
resp = self.testapp.get('/pywb/20140703030321mp_/http://example.com?example=3', status=503) resp = self.testapp.get('/pywb/20140703030321mp_/http://example.com/?example=3', status=503)
assert resp.status_int == 503 assert resp.status_int == 503
def test_missing_revisit_no_other(self): def test_missing_revisit_no_other(self):
resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com?example=3', status=503) resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com/?example=3', status=503)
assert resp.status_int == 503 assert resp.status_int == 503
def test_live_frame(self): def test_live_frame(self):
@ -429,7 +439,7 @@ class TestWbIntegration(BaseConfigTest):
assert 'Excluded' in resp.text assert 'Excluded' in resp.text
def test_replay_not_found(self): def test_replay_not_found(self):
resp = self.testapp.head('/pywb/mp_/http://not-exist.example.com', status=404) resp = self.testapp.head('/pywb/mp_/http://not-exist.example.com/', status=404)
assert resp.content_type == 'text/html' assert resp.content_type == 'text/html'
assert resp.status_int == 404 assert resp.status_int == 404

View File

@ -32,11 +32,13 @@ class TestLiveRewriter(BaseConfigTest):
assert 'src="http://localhost:80/live/mp_/http://example.com/"' in resp.text, resp.text assert 'src="http://localhost:80/live/mp_/http://example.com/"' in resp.text, resp.text
def test_live_invalid(self): def test_live_invalid(self):
resp = self.testapp.get('/live/mp_/http://abcdef', status=400) resp = self.testapp.get('/live/mp_/http://abcdef', status=307)
resp = resp.follow(status=400)
assert resp.status_int == 400 assert resp.status_int == 400
def test_live_invalid_2(self): def test_live_invalid_2(self):
resp = self.testapp.get('/live/mp_/@#$@#$', status=400) resp = self.testapp.get('/live/mp_/@#$@#$', status=307)
resp = resp.follow(status=400)
assert resp.status_int == 400 assert resp.status_int == 400
def test_live_video_info(self): def test_live_video_info(self):