mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
proxy cleanup: move HttpsUrlRewriter to the url_rewriter module, and move strip_scheme to replay_views, where it is used.
regex rewriters: use the url rewriter, instead of just the prefix, for rewriting http:// in JS, to support custom rewriters (such as the https->http rewriter in proxy mode)
This commit is contained in:
parent
68878fa72a
commit
e384425d48
@ -1,6 +1,6 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
@ -11,6 +11,9 @@ from pywb.utils.loaders import LimitReader
|
||||
|
||||
#=================================================================
|
||||
class ReplayView:
|
||||
|
||||
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
|
||||
|
||||
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
|
||||
redir_to_exact = True, buffer_response = False, reporter = None):
|
||||
|
||||
@ -181,16 +184,20 @@ class ReplayView:
|
||||
if not status_headers.statusline.startswith('3'):
|
||||
return
|
||||
|
||||
# skip all 304s
|
||||
if (status_headers.statusline.startswith('304') and
|
||||
not wbrequest.wb_url.mod == 'id_'):
|
||||
|
||||
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
||||
|
||||
request_url = wbrequest.wb_url.url.lower()
|
||||
location_url = status_headers.get_header('Location')
|
||||
if not location_url:
|
||||
if status_headers.statusline.startswith('304'):
|
||||
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
||||
return
|
||||
return
|
||||
|
||||
location_url = location_url.lower()
|
||||
|
||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||
if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
|
||||
raise CaptureException('Self Redirect: ' + str(cdx))
|
||||
|
||||
def _reject_referrer_self_redirect(self, wbrequest):
|
||||
@ -206,6 +213,36 @@ class ReplayView:
|
||||
request_url = (wbrequest.host_prefix +
|
||||
wbrequest.rel_prefix + str(wbrequest.wb_url))
|
||||
|
||||
if (UrlRewriter.strip_protocol(request_url) ==
|
||||
UrlRewriter.strip_protocol(wbrequest.referrer)):
|
||||
if (ReplayView.strip_scheme(request_url) ==
|
||||
ReplayView.strip_scheme(wbrequest.referrer)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def strip_scheme(url):
|
||||
"""
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
|
||||
True
|
||||
|
||||
>>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
|
||||
True
|
||||
"""
|
||||
m = ReplayView.STRIP_SCHEME.match(url)
|
||||
if not m:
|
||||
return url
|
||||
|
||||
match = m.group(2)
|
||||
if match:
|
||||
return match
|
||||
else:
|
||||
return url
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -2,6 +2,7 @@ from wbrequestresponse import WbResponse, WbRequest
|
||||
from archivalrouter import ArchivalRouter
|
||||
import urlparse
|
||||
|
||||
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
||||
|
||||
#=================================================================
|
||||
# An experimental router which combines both archival and proxy modes
|
||||
@ -64,7 +65,7 @@ class ProxyRouter:
|
||||
#rel_prefix=url,
|
||||
host_prefix=self.hostpaths[0],
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
urlrewriter_class=ProxyHttpsUrlRewriter,
|
||||
urlrewriter_class=HttpsUrlRewriter,
|
||||
use_abs_prefix=False,
|
||||
is_proxy=True)
|
||||
|
||||
@ -97,26 +98,3 @@ class ProxyRouter:
|
||||
content_type = 'application/x-ns-proxy-autoconfig'
|
||||
|
||||
return WbResponse.text_response(buff, content_type=content_type)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# A rewriter which only rewrites https -> http
|
||||
#=================================================================
|
||||
class ProxyHttpsUrlRewriter:
|
||||
HTTP = 'http://'
|
||||
HTTPS = 'https://'
|
||||
|
||||
def __init__(self, wbrequest, prefix):
|
||||
pass
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
if url.startswith(self.HTTPS):
|
||||
return self.HTTP + url[len(self.HTTPS):]
|
||||
else:
|
||||
return url
|
||||
|
||||
def get_timestamp_url(self, timestamp, url):
|
||||
return url
|
||||
|
||||
def get_abs_url(self, url=''):
|
||||
return url
|
||||
|
@ -108,7 +108,10 @@ class JSLinkOnlyRewriter(RegexRewriter):
|
||||
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
|
||||
|
||||
def __init__(self, rewriter, rules=[]):
|
||||
rules = rules + [(self.JS_HTTPX, rewriter.get_abs_url(), 0)]
|
||||
rules = rules + [
|
||||
#(self.JS_HTTPX, rewriter.get_abs_url(), 0)
|
||||
(self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0)
|
||||
]
|
||||
super(JSLinkOnlyRewriter, self).__init__(rules)
|
||||
|
||||
|
||||
|
@ -4,6 +4,7 @@ import urlparse
|
||||
from wburl import WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UrlRewriter:
|
||||
"""
|
||||
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
@ -30,6 +31,9 @@ class UrlRewriter:
|
||||
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
|
||||
'/2020/http://example.com/other.html'
|
||||
|
||||
@ -50,14 +54,11 @@ class UrlRewriter:
|
||||
|
||||
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
|
||||
'/123/20131024id_/http://example.com/file/path/blah.html'
|
||||
|
||||
>>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
|
||||
True
|
||||
"""
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
||||
|
||||
PROTOCOLS = ['http:/', 'https:/', '//', 'ftp:/', 'mms:/', 'rtsp:/', 'wais:/']
|
||||
PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
@ -109,19 +110,45 @@ class UrlRewriter:
|
||||
def __repr__(self):
|
||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||
|
||||
@staticmethod
|
||||
def strip_protocol(url):
|
||||
for protocol in UrlRewriter.PROTOCOLS:
|
||||
if url.startswith(protocol):
|
||||
return url[len(protocol):]
|
||||
|
||||
return url
|
||||
|
||||
|
||||
def do_rewrite(rel_url, base_url, prefix, mod = None):
|
||||
rewriter = UrlRewriter(base_url, prefix)
|
||||
return rewriter.rewrite(rel_url, mod)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HttpsUrlRewriter:
|
||||
"""
|
||||
A url rewriter which rewrites urls that start with https:// to http://
|
||||
Other urls/input is unchanged.
|
||||
|
||||
>>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc')
|
||||
'http://example.com/abc'
|
||||
|
||||
>>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc')
|
||||
'http://example.com/abc'
|
||||
"""
|
||||
HTTP = 'http://'
|
||||
HTTPS = 'https://'
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
pass
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
if url.startswith(self.HTTPS):
|
||||
result = self.HTTP + url[len(self.HTTPS):]
|
||||
return result
|
||||
else:
|
||||
return url
|
||||
|
||||
def get_timestamp_url(self, timestamp, url):
|
||||
return url
|
||||
|
||||
def get_abs_url(self, url=''):
|
||||
return url
|
||||
|
||||
def set_base_url(self, newUrl):
|
||||
pass
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
Loading…
x
Reference in New Issue
Block a user