From f2d7bd074a7326a2a7083317c6a227d1e8f15186 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 5 Mar 2015 16:18:56 -0800 Subject: [PATCH 1/5] bump version to 0.8.3 cookie rewrite: remove 'secure' flag if present --- CHANGES.rst | 6 ++++++ README.rst | 2 +- pywb/rewrite/cookie_rewriter.py | 5 +++++ setup.py | 2 +- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7539b88a..335f43f6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,9 @@ +pywb 0.8.3 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* cookie rewrite: remove cookie ``secure`` flag to allow equivalent replay via http as well as https + + pywb 0.8.2 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index d8d85c2d..444cbae6 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.8.2 +PyWb 0.8.3 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index 63db9a93..c17d0dad 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -32,6 +32,11 @@ class WbUrlBaseCookieRewriter(object): if morsel.get('max-age'): del morsel['max-age'] + # for now, also remove secure to avoid issues when + # proxying over plain http (TODO: detect https?) 
+ if morsel.get('secure'): + del morsel['secure'] + #================================================================= class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): diff --git a/setup.py b/setup.py index e88cdb42..478630fe 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.8.2', + version='0.8.3', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 1fb631870b01e041b74f056eef68359d1ccccfaa Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 5 Mar 2015 17:04:44 -0800 Subject: [PATCH 2/5] wb_frame: fix extra slash typo in replaced frame url --- pywb/static/wb_frame.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 79642db1..640b3e88 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -38,7 +38,7 @@ function make_inner_url(url, ts) if (ts) { return wbinfo.prefix + ts + "/" + url; } else { - return wbinfo.prefix + "/" + url; + return wbinfo.prefix + url; } } From 24021fcd5731e9e507617e295a2b0d1033710edb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 13 Mar 2015 10:53:57 -0700 Subject: [PATCH 3/5] html rewrite: add trailing slash for tag rewrite if url is a scheme://host with no path component #77 cleanup: remove unused code path for tags with no rewriting -- all tags now checked for dynamic attrs which may need rewriting update tests, including live rewrite test dependent on live site (FB) --- pywb/rewrite/html_rewriter.py | 37 ++++++++++++++++--------- pywb/rewrite/test/test_html_rewriter.py | 4 +++ pywb/rewrite/test/test_rewrite_live.py | 5 ++-- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index c4ec854c..4005293f 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -5,6 +5,7 @@ import sys import re from HTMLParser import 
HTMLParser, HTMLParseError +from urlparse import urlsplit from url_rewriter import UrlRewriter from regex_rewriters import JSRewriter, CSSRewriter @@ -121,7 +122,22 @@ class HTMLRewriterMixin(object): meta_refresh[m.end(1):]) return meta_refresh - # =========================== + + def _rewrite_base(self, value, mod=''): + if not value.endswith('/'): + # check if hostname with no path, + # eg http://example.com + if not urlsplit(value).path: + value += '/' + + base_value = self._rewrite_url(value, mod) + + if self.opts.get('rewrite_base', True): + value = base_value + + self.url_rewriter = (self.url_rewriter. + rebase_rewriter(base_value)) + return value def _rewrite_url(self, value, mod=None): if value: @@ -221,12 +237,7 @@ class HTMLRewriterMixin(object): # special case: base tag elif (tag == 'base') and (attr_name == 'href') and attr_value: rw_mod = handler.get(attr_name) - base_value = self._rewrite_url(attr_value, rw_mod) - if self.opts.get('rewrite_base', True): - attr_value = base_value - self.url_rewriter = (self.url_rewriter. 
- rebase_rewriter(base_value)) - + attr_value = self._rewrite_base(attr_value, rw_mod) else: # rewrite url using tag handler rw_mod = handler.get(attr_name) @@ -338,15 +349,15 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): return s def handle_starttag(self, tag, attrs): - if not self._rewrite_tag_attrs(tag, attrs): - self.out.write(self.get_starttag_text()) - elif tag != 'head' or not self._rewrite_head(False): + self._rewrite_tag_attrs(tag, attrs) + + if tag != 'head' or not self._rewrite_head(False): self.out.write('>') def handle_startendtag(self, tag, attrs): - if not self._rewrite_tag_attrs(tag, attrs): - self.out.write(self.get_starttag_text()) - elif tag != 'head' or not self._rewrite_head(True): + self._rewrite_tag_attrs(tag, attrs) + + if tag != 'head' or not self._rewrite_head(True): self.out.write('/>') def handle_endtag(self, tag): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 3938efda..12988b50 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -27,6 +27,10 @@ ur""" >>> parse('') +# ensure trailing slash added +>>> parse('') + + # Base Tests -- no rewrite >>> parse('', urlrewriter=no_base_canon_rewriter) diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 8153e223..b7380977 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -203,8 +203,9 @@ def test_example_4_rewrite_err(): def test_example_domain_specific_3(): status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True) - # comment out bootloader - assert '/* Bootloader.configurePage' in buff + # comment out Bootloader.configurePage, if it is still there + if 'Bootloader.configurePage' in buff: + assert '/* Bootloader.configurePage' in buff def test_wombat_top(): #status_headers, buff = 
get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter) From 3e3794d4dc17f3baa5166a7b40e28294e23ab6b9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 13 Mar 2015 11:04:37 -0700 Subject: [PATCH 4/5] Update CHANGES for 0.8.3 --- CHANGES.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 335f43f6..8a22cce2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,13 @@ pywb 0.8.3 changelist ~~~~~~~~~~~~~~~~~~~~~ -* cookie rewrite: remove cookie ``secure`` flag to allow equivalent replay via http as well as https +* cookie rewrite: all cookie rewriters remove ``secure`` flag to allow equivalent replay of sites with cookies via HTTP and HTTPS. + +* html rewrite: fix ```` tag rewriting to keep add trailing slash if url is a hostname with no path, ex: + + ```` -> ```` + +* framed replay: fix double slash that remained when rewriting top frame url. pywb 0.8.2 changelist From b2ce3feb80e7661af8032e7277edbe528d949e17 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 13 Mar 2015 11:05:32 -0700 Subject: [PATCH 5/5] readme fix --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8a22cce2..8b5d4211 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,7 +3,7 @@ pywb 0.8.3 changelist * cookie rewrite: all cookie rewriters remove ``secure`` flag to allow equivalent replay of sites with cookies via HTTP and HTTPS. -* html rewrite: fix ```` tag rewriting to keep add trailing slash if url is a hostname with no path, ex: +* html rewrite: fix ```` tag rewriting to add a trailing slash to the url if it is a hostname with no path, ex: ```` -> ````