From 2792a92ff6484c5cefda495c6e9426d2ac879af9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 4 Aug 2014 21:11:46 -0700 Subject: [PATCH] rewrite: remove extra wb_url param from rewrite_content(), the wb_url will come from the urlrewriter, to get the 'mod' --- pywb/rewrite/rewrite_content.py | 4 +++- pywb/rewrite/rewrite_live.py | 11 ++--------- pywb/rewrite/test/test_rewrite_live.py | 21 +++++++-------------- pywb/rewrite/test/test_url_rewriter.py | 4 ++-- pywb/rewrite/url_rewriter.py | 5 +---- pywb/webapp/live_rewrite_handler.py | 8 ++++++-- pywb/webapp/replay_views.py | 3 +-- 7 files changed, 22 insertions(+), 34 deletions(-) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 3cbcd362..207d879e 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -58,10 +58,12 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, wb_url, urlrewriter, headers, stream, + def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey='', cdx=None): + wb_url = urlrewriter.wburl + if (wb_url.is_identity or (not head_insert_func and wb_url.is_banner_only)): status_headers, stream = self.sanitize_content(headers, stream) diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 5d77ff52..41313738 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -119,7 +119,7 @@ class LiveRewriter(object): return (status_headers, stream) - def fetch_request(self, wb_url, urlrewriter, + def fetch_request(self, url, urlrewriter, head_insert_func=None, urlkey=None, env=None, @@ -128,12 +128,6 @@ class LiveRewriter(object): follow_redirects=False, proxies=None): - if isinstance(wb_url, str): - url = wb_url - wb_url = WbUrl(url) - else: - url = wb_url.url - ts_err = url.split('///') if len(ts_err) > 1 and ts_err[0] != 'file:': @@ -167,8 +161,7 @@ class LiveRewriter(object): } result = (self.rewriter. - rewrite_content(wb_url, - urlrewriter, + rewrite_content(urlrewriter, status_headers, stream, head_insert_func=head_insert_func, diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index af25762b..fcb51ea3 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -10,6 +10,7 @@ from io import BytesIO # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') def head_insert_func(rule, cdx): if rule.js_rewrite_location == True: @@ -35,8 +36,7 @@ def test_local_1(): def test_local_no_head(): - wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html') - status_headers, buff = get_rewritten(wb_url, + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', urlrewriter, head_insert_func, 'com,example,test)/') @@ -51,11 +51,8 @@ def test_local_no_head(): assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff def test_local_no_head_banner_only(): - wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html') - wb_url.mod = 'bn_' - - status_headers, buff = get_rewritten(wb_url, - urlrewriter, + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', + bn_urlrewriter, head_insert_func, 'com,example,test)/') @@ -69,11 +66,8 @@ def test_local_no_head_banner_only(): assert '"another.html"' in buff def test_local_banner_only(): - wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample.html') - wb_url.mod = 'bn_' - - status_headers, buff = get_rewritten(wb_url, - urlrewriter, + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + bn_urlrewriter, head_insert_func, 'com,example,test)/') @@ -129,8 +123,7 @@ def test_example_4_rewrite_err(): assert status_headers.get_statuscode() == '200' def test_example_domain_specific_3(): - urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True) + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True) # comment out bootloader assert '/* Bootloader.configurePage' in buff diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index 345c4faf..73340c95 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -76,10 +76,10 @@ # HttpsUrlRewriter tests ->>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc') +>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc') 'http://example.com/abc' ->>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc') +>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc') 'http://example.com/abc' """ diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index d9b42c1b..5b2f8e7b 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -126,7 +126,7 @@ class UrlRewriter(object): #================================================================= -class HttpsUrlRewriter(object): +class HttpsUrlRewriter(UrlRewriter): """ A url rewriter which urls that start with https:// to http:// Other urls/input is unchanged. @@ -135,9 +135,6 @@ class HttpsUrlRewriter(object): HTTP = 'http://' HTTPS = 'https://' - def __init__(self, wburl, prefix, full_prefix=None): - pass - def rewrite(self, url, mod=None): if url.startswith(self.HTTPS): result = self.HTTP + url[len(self.HTTPS):] diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index a1b602d4..cb279beb 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -38,6 +38,10 @@ class RewriteHandler(SearchPageWbUrlHandler): return self.render_content(wbrequest) except Exception as exc: + import traceback + err_details = traceback.format_exc(exc) + print err_details + url = wbrequest.wb_url.url msg = 'Could not load the url from the live web: ' + url raise LiveResourceException(msg=msg, url=url) @@ -53,8 +57,8 @@ class RewriteHandler(SearchPageWbUrlHandler): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - wb_url = wbrequest.wb_url - result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, + result = self.rewriter.fetch_request(wbrequest.wb_url.url, + wbrequest.urlrewriter, head_insert_func=head_insert_func, req_headers=req_headers, env=wbrequest.env) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 5002a18d..9f32ad5d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -130,8 +130,7 @@ class ReplayView(object): create_insert_func(wbrequest)) result = (self.content_rewriter. - rewrite_content(wbrequest.wb_url, - urlrewriter, + rewrite_content(urlrewriter, headers=status_headers, stream=stream, head_insert_func=head_insert_func,