1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

proxy cleanup: move HttpsUrlRewriter to url_rewriter module,

move strip_scheme to replay_views where it is used
regex rewriters: use the url rewriter itself for rewriting http:// urls in JS,
instead of just prepending the prefix, to support custom rewriters (such as
the https->http rewriter used in proxy mode)
This commit is contained in:
Ilya Kreymer 2014-03-09 14:21:32 -07:00
parent 68878fa72a
commit e384425d48
4 changed files with 89 additions and 44 deletions

View File

@ -1,6 +1,6 @@
import re
from io import BytesIO
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.framework.wbrequestresponse import WbResponse
@ -11,6 +11,9 @@ from pywb.utils.loaders import LimitReader
#=================================================================
class ReplayView:
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
redir_to_exact = True, buffer_response = False, reporter = None):
@ -181,16 +184,20 @@ class ReplayView:
if not status_headers.statusline.startswith('3'):
return
# skip all 304s
if (status_headers.statusline.startswith('304') and
not wbrequest.wb_url.mod == 'id_'):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location')
if not location_url:
if status_headers.statusline.startswith('304'):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
return
return
location_url = location_url.lower()
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest):
@ -206,6 +213,36 @@ class ReplayView:
request_url = (wbrequest.host_prefix +
wbrequest.rel_prefix + str(wbrequest.wb_url))
if (UrlRewriter.strip_protocol(request_url) ==
UrlRewriter.strip_protocol(wbrequest.referrer)):
if (ReplayView.strip_scheme(request_url) ==
ReplayView.strip_scheme(wbrequest.referrer)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
@staticmethod
def strip_scheme(url):
"""
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http://example.com')
True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('http:/example.com')
True
>>> ReplayView.strip_scheme('https://example.com') == ReplayView.strip_scheme('example.com')
True
>>> ReplayView.strip_scheme('about://example.com') == ReplayView.strip_scheme('example.com')
True
"""
m = ReplayView.STRIP_SCHEME.match(url)
if not m:
return url
match = m.group(2)
if match:
return match
else:
return url
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()

View File

@ -2,6 +2,7 @@ from wbrequestresponse import WbResponse, WbRequest
from archivalrouter import ArchivalRouter
import urlparse
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
#=================================================================
# An experimental router which combines both archival and proxy modes
@ -64,7 +65,7 @@ class ProxyRouter:
#rel_prefix=url,
host_prefix=self.hostpaths[0],
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=ProxyHttpsUrlRewriter,
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
is_proxy=True)
@ -97,26 +98,3 @@ class ProxyRouter:
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)
#=================================================================
# A rewriter which only rewrites https -> http
#=================================================================
class ProxyHttpsUrlRewriter:
    """
    Url rewriter that only converts a leading https:// scheme to http://.

    Any other input is returned unchanged. The constructor arguments are
    ignored.
    """

    HTTP = 'http://'
    HTTPS = 'https://'

    def __init__(self, wbrequest, prefix):
        # nothing to store: no method here reads the request or the prefix
        pass

    def rewrite(self, url, mod=None):
        """
        Return url with a leading https:// replaced by http://.

        The scheme is compared case-insensitively, so 'HTTPS://...' is
        rewritten as well. mod is accepted but not used.
        """
        scheme_len = len(self.HTTPS)
        # URL schemes are case-insensitive, so lowercase only the prefix
        # before comparing; the rest of the url is left untouched.
        if url[:scheme_len].lower() == self.HTTPS:
            return self.HTTP + url[scheme_len:]
        return url

    def get_timestamp_url(self, timestamp, url):
        """Return url unchanged; timestamp is ignored."""
        return url

    def get_abs_url(self, url=''):
        """Return url unchanged."""
        return url

View File

@ -108,7 +108,10 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules=[]):
    """
    Append the JS http(s) link rule to rules and initialize the base
    rewriter with the combined list.

    :param rewriter: url rewriter passed to RegexRewriter.archival_rewrite
    :param rules: rules placed before the link rule in the list; the list
        is never mutated here (a new list is built), so the shared
        default value is not modified
    """
    # Matched urls are handed to the url rewriter itself rather than being
    # replaced with a fixed prefix, so custom rewriters are honored.
    link_rule = (self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0)
    super(JSLinkOnlyRewriter, self).__init__(rules + [link_rule])

View File

@ -4,6 +4,7 @@ import urlparse
from wburl import WbUrl
#=================================================================
class UrlRewriter:
"""
>>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
@ -30,6 +31,9 @@ class UrlRewriter:
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
@ -50,14 +54,11 @@ class UrlRewriter:
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
'/123/20131024id_/http://example.com/file/path/blah.html'
>>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
True
"""
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
PROTOCOLS = ['http:/', 'https:/', '//', 'ftp:/', 'mms:/', 'rtsp:/', 'wais:/']
PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
@ -109,19 +110,45 @@ class UrlRewriter:
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod
def strip_protocol(url):
for protocol in UrlRewriter.PROTOCOLS:
if url.startswith(protocol):
return url[len(protocol):]
return url
def do_rewrite(rel_url, base_url, prefix, mod = None):
    """Rewrite rel_url with a UrlRewriter created from base_url and prefix."""
    return UrlRewriter(base_url, prefix).rewrite(rel_url, mod)
#=================================================================
class HttpsUrlRewriter:
    """
    A url rewriter which rewrites urls that start with https:// to http://
    Other urls/input are left unchanged. The constructor arguments are
    ignored.

    >>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc')
    'http://example.com/abc'

    >>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc')
    'http://example.com/abc'

    >>> HttpsUrlRewriter(None, None).rewrite('HTTPS://example.com/abc')
    'http://example.com/abc'
    """

    HTTP = 'http://'
    HTTPS = 'https://'

    def __init__(self, wburl, prefix):
        # nothing to store: no method here reads wburl or prefix
        pass

    def rewrite(self, url, mod=None):
        """
        Return url with a leading https:// replaced by http://.

        The scheme is compared case-insensitively, so 'HTTPS://...' is
        rewritten as well. mod is accepted but not used.
        """
        scheme_len = len(self.HTTPS)
        # URL schemes are case-insensitive, so lowercase only the prefix
        # before comparing; the rest of the url is left untouched.
        if url[:scheme_len].lower() == self.HTTPS:
            return self.HTTP + url[scheme_len:]
        return url

    def get_timestamp_url(self, timestamp, url):
        """Return url unchanged; timestamp is ignored."""
        return url

    def get_abs_url(self, url=''):
        """Return url unchanged."""
        return url

    def set_base_url(self, newUrl):
        """Accept a new base url and do nothing with it."""
        pass
if __name__ == "__main__":
    # Script entry point: execute the doctests defined in this module.
    from doctest import testmod
    testmod()