From 83f8d7d29becf024a72ccaccfa78917ac6571fe4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 22 Feb 2015 22:51:23 -0800 Subject: [PATCH 1/5] bump version to 0.8.2 --- CHANGES.rst | 3 +++ README.rst | 2 +- setup.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1682d3d2..56045c8b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,6 @@ +pywb 0.8.2 changelist +~~~~~~~~~~~~~~~~~~~~~ + pywb 0.8.1 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index 80e84c67..d8d85c2d 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.8.1 +PyWb 0.8.2 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/setup.py b/setup.py index c6a0f1b9..e88cdb42 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.8.1', + version='0.8.2', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 5d80d2d89193e018685e4400538bc694349361c9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 22 Feb 2015 22:51:35 -0800 Subject: [PATCH 2/5] replay: change strip_scheme() to strip_scheme_www() to also strip away www. 
prefix for self-redirect checking, #73 --- pywb/webapp/replay_views.py | 42 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index d474e178..cb39383d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -34,7 +34,7 @@ class CaptureException(WbException): #================================================================= class ReplayView(object): - STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') + STRIP_SCHEME_WWW = re.compile('^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$') def __init__(self, content_loader, config): self.content_loader = content_loader @@ -286,8 +286,8 @@ class ReplayView(object): host = urlsplit(cdx['original']).netloc location_url = host + location_url - if (ReplayView.strip_scheme(request_url) == - ReplayView.strip_scheme(location_url)): + if (ReplayView.strip_scheme_www(request_url) == + ReplayView.strip_scheme_www(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) # TODO: reevaluate this, as it may reject valid refreshes of a page @@ -307,39 +307,43 @@ class ReplayView(object): request_url = (wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)) - if (ReplayView.strip_scheme(request_url) == - ReplayView.strip_scheme(wbrequest.referrer)): + if (ReplayView.strip_scheme_www(request_url) == + ReplayView.strip_scheme_www(wbrequest.referrer)): raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) @staticmethod - def strip_scheme(url): + def strip_scheme_www(url): """ - >>> ReplayView.strip_scheme('https://example.com') ==\ - ReplayView.strip_scheme('http://example.com') + >>> ReplayView.strip_scheme_www('https://example.com') ==\ + ReplayView.strip_scheme_www('http://example.com') True - >>> ReplayView.strip_scheme('https://example.com') ==\ - ReplayView.strip_scheme('http:/example.com') + >>> ReplayView.strip_scheme_www('https://example.com') ==\ + 
ReplayView.strip_scheme_www('http:/example.com') True - >>> ReplayView.strip_scheme('https://example.com') ==\ - ReplayView.strip_scheme('example.com') + >>> ReplayView.strip_scheme_www('https://example.com') ==\ + ReplayView.strip_scheme_www('example.com') True - >>> ReplayView.strip_scheme('about://example.com') ==\ - ReplayView.strip_scheme('example.com') + >>> ReplayView.strip_scheme_www('https://example.com') ==\ + ReplayView.strip_scheme_www('http://www2.example.com') True - >>> ReplayView.strip_scheme('http://') ==\ - ReplayView.strip_scheme('') + >>> ReplayView.strip_scheme_www('about://example.com') ==\ + ReplayView.strip_scheme_www('example.com') True - >>> ReplayView.strip_scheme('#!@?') ==\ - ReplayView.strip_scheme('#!@?') + >>> ReplayView.strip_scheme_www('http://') ==\ + ReplayView.strip_scheme_www('') + True + + >>> ReplayView.strip_scheme_www('#!@?') ==\ + ReplayView.strip_scheme_www('#!@?') True """ - m = ReplayView.STRIP_SCHEME.match(url) + m = ReplayView.STRIP_SCHEME_WWW.match(url) match = m.group(2) return match From 39824711f06cd6c650b17fb17b28c2db2faa5720 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 Feb 2015 23:21:39 -0800 Subject: [PATCH 3/5] memento tweak: ensure rel=memento link for timegate uses exact in Location (cdx original) as opposed to url from request --- pywb/framework/memento.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 29fe7ca8..d5604bc6 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -74,10 +74,13 @@ class MementoRespMixin(object): is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY) link = [] + req_url = wbrequest.wb_url.url if is_memento or is_timegate: + url = req_url if cdx: ts = cdx['timestamp'] + url = cdx['original'] # for top frame elif wbrequest.wb_url.timestamp: ts = wbrequest.wb_url.timestamp @@ -91,13 +94,14 @@ class MementoRespMixin(object): 
self.status_headers.headers.append(('Memento-Datetime', http_date)) - canon_link = wbrequest.urlrewriter.get_new_url(mod='', timestamp=ts) + canon_link = wbrequest.urlrewriter.get_new_url(mod='', + timestamp=ts, + url=url) + link.append(self.make_memento_link(canon_link, 'memento', http_date)) - req_url = wbrequest.wb_url.url - if is_memento and is_timegate: link.append(self.make_link(req_url, 'original timegate')) else: From cb857df125f4da1d815c62c564c4cfe86cb1ffa1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 24 Feb 2015 10:35:49 -0800 Subject: [PATCH 4/5] memento: fix MementoTimemapView to have consistent signature with other query views --- pywb/webapp/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 11624a22..036977a1 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -189,7 +189,7 @@ class J2HtmlCapturesView(J2TemplateView): #================================================================= class MementoTimemapView(object): - def render_response(self, wbrequest, cdx_lines): + def render_response(self, wbrequest, cdx_lines, **kwargs): memento_lines = make_timemap(wbrequest, cdx_lines) return WbResponse.text_stream(memento_lines, content_type=LINK_FORMAT) From 6c8cb806d9fad73fa4a2a14d304216f4710edbe0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 28 Feb 2015 09:04:15 -0800 Subject: [PATCH 5/5] update 0.8.2 changelist, minor fixes --- CHANGES.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 56045c8b..7539b88a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ pywb 0.8.2 changelist ~~~~~~~~~~~~~~~~~~~~~ +* rewrite: fix for redirect loop related to pages with 'www.' prefix. Since canonicalization removes the prefix, treat redirect to 'www.' as self-redirect (for now). 
+ +* memento: ensure rel=memento url matches the timegate redirect exactly (urls may differ due to canonicalization, so the actual cdx url is used instead of the requested url for both) + + pywb 0.8.1 changelist ~~~~~~~~~~~~~~~~~~~~~