diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 7de46f21..f4f5346d 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -1,6 +1,6 @@ +import re from io import BytesIO -from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.framework.wbrequestresponse import WbResponse @@ -11,6 +11,9 @@ from pywb.utils.loaders import LimitReader #================================================================= class ReplayView: + + STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') + def __init__(self, content_loader, content_rewriter, head_insert_view = None, redir_to_exact = True, buffer_response = False, reporter = None): @@ -181,16 +184,20 @@ class ReplayView: if not status_headers.statusline.startswith('3'): return + # skip all 304s + if (status_headers.statusline.startswith('304') and + not wbrequest.wb_url.mod == 'id_'): + + raise CaptureException('Skipping 304 Modified: ' + str(cdx)) + request_url = wbrequest.wb_url.url.lower() location_url = status_headers.get_header('Location') if not location_url: - if status_headers.statusline.startswith('304'): - raise CaptureException('Skipping 304 Modified: ' + str(cdx)) - return + return location_url = location_url.lower() - if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): + if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) def _reject_referrer_self_redirect(self, wbrequest): @@ -206,6 +213,36 @@ class ReplayView: request_url = (wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)) - if (UrlRewriter.strip_protocol(request_url) == - UrlRewriter.strip_protocol(wbrequest.referrer)): + if (ReplayView.strip_scheme(request_url) == + ReplayView.strip_scheme(wbrequest.referrer)): raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) + + + @staticmethod + def 
strip_scheme(url): + """ + >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com') + True + + >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com') + True + + >>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com') + True + + >>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com') + True + """ + m = ReplayView.STRIP_SCHEME.match(url) + if not m: + return url + + match = m.group(2) + if match: + return match + else: + return url + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 8aa8f81d..fa0421b0 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -2,6 +2,7 @@ from wbrequestresponse import WbResponse, WbRequest from archivalrouter import ArchivalRouter import urlparse +from pywb.rewrite.url_rewriter import HttpsUrlRewriter #================================================================= # An experimental router which combines both archival and proxy modes @@ -64,7 +65,7 @@ class ProxyRouter: #rel_prefix=url, host_prefix=self.hostpaths[0], wburl_class=self.handler.get_wburl_type(), - urlrewriter_class=ProxyHttpsUrlRewriter, + urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, is_proxy=True) @@ -97,26 +98,3 @@ class ProxyRouter: content_type = 'application/x-ns-proxy-autoconfig' return WbResponse.text_response(buff, content_type=content_type) - - -#================================================================= -# A rewriter which only rewrites https -> http -#================================================================= -class ProxyHttpsUrlRewriter: - HTTP = 'http://' - HTTPS = 'https://' - - def __init__(self, wbrequest, prefix): - pass - - def rewrite(self, url, mod=None): - if url.startswith(self.HTTPS): - return self.HTTP + url[len(self.HTTPS):] - else: - return url - - def 
get_timestamp_url(self, timestamp, url): - return url - - def get_abs_url(self, url=''): - return url diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index a435b104..7970ab48 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -108,7 +108,10 @@ class JSLinkOnlyRewriter(RegexRewriter): JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' def __init__(self, rewriter, rules=[]): - rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)] + rules = rules + [ + #(self.JS_HTTPX, rewriter.get_abs_url(), 0) + (self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0) + ] super(JSLinkOnlyRewriter, self).__init__(rules) diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 5e8aec19..632215b5 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -4,6 +4,7 @@ import urlparse from wburl import WbUrl +#================================================================= class UrlRewriter: """ >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') @@ -30,6 +31,9 @@ class UrlRewriter: >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http://some-other-site.com' + >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') + 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' @@ -50,14 +54,11 @@ class UrlRewriter: >>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024') '/123/20131024id_/http://example.com/file/path/blah.html' - - >>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com') - True """ NO_REWRITE_URI_PREFIX = ['#', 
'javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http:/', 'https:/', '//', 'ftp:/', 'mms:/', 'rtsp:/', 'wais:/'] + PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:'] def __init__(self, wburl, prefix): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) @@ -109,19 +110,45 @@ class UrlRewriter: def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) - @staticmethod - def strip_protocol(url): - for protocol in UrlRewriter.PROTOCOLS: - if url.startswith(protocol): - return url[len(protocol):] - - return url - def do_rewrite(rel_url, base_url, prefix, mod = None): rewriter = UrlRewriter(base_url, prefix) return rewriter.rewrite(rel_url, mod) + +#================================================================= +class HttpsUrlRewriter: + """ + A url rewriter which rewrites urls that start with https:// to http:// + Other urls/input is unchanged. + + >>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc') + 'http://example.com/abc' + + >>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc') + 'http://example.com/abc' + """ + HTTP = 'http://' + HTTPS = 'https://' + + def __init__(self, wburl, prefix): + pass + + def rewrite(self, url, mod=None): + if url.startswith(self.HTTPS): + result = self.HTTP + url[len(self.HTTPS):] + return result + else: + return url + + def get_timestamp_url(self, timestamp, url): + return url + + def get_abs_url(self, url=''): + return url + + def set_base_url(self, newUrl): + pass if __name__ == "__main__": import doctest doctest.testmod()