From 925f8337a5fcc3f421158525453dc2f67edcc91e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 27 Sep 2017 13:47:02 -0700 Subject: [PATCH] Proxy Mode Support (#244) proxy mode support re-added! - use wsgiprox wrapper in FrontEndApp.init_proxy() with fixed collection prefix, ca options - cli --proxy flag added to specify proxy collection - cleanup: remove cookie rw (already disabled), fix post handling paths - headers: ensure request headers are not rewritten when in proxy mode, response headers marked with 'url-rewrite' are also not rewritten if no url rewrite/proxy mode - urlrewriter: add IdentityUrlRewriter with no rewriting as default, instead of SchemeOnlyUrlRewriter - memento support: for now, only include the rel="memento" link and Memento-Datetime in the proxy replay response - responseloader: disable urllib3 insecure response warnings - tests: add test for proxy replay and proxy record/replay of new collection --- pywb/apps/cli.py | 5 ++ pywb/apps/frontendapp.py | 13 +++- pywb/apps/rewriterapp.py | 38 ++++++------ pywb/rewrite/header_rewriter.py | 6 +- pywb/rewrite/rewriteinputreq.py | 48 +++++---------- pywb/rewrite/url_rewriter.py | 34 +++++++---- pywb/rules.yaml | 18 ------ pywb/warcserver/http.py | 2 + pywb/warcserver/inputrequest.py | 6 +- requirements.txt | 1 + tests/test_proxy.py | 101 ++++++++++++++++++++++++++++++++ 11 files changed, 185 insertions(+), 87 deletions(-) create mode 100644 tests/test_proxy.py diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 6e6574ba..101be519 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -34,6 +34,8 @@ class BaseCli(object): parser.add_argument('--debug', action='store_true') parser.add_argument('--profile', action='store_true') + parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection') + parser.add_argument('--live', action='store_true', help='Add /live handler') self.desc = desc @@ -48,6 +50,9 @@ class BaseCli(object): self.application = self.load() + if self.r.proxy: +
self.application = self.application.init_proxy(self.r.proxy) + if self.r.profile: from werkzeug.contrib.profiler import ProfilerMiddleware self.application = ProfilerMiddleware(self.application) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 4d0b3391..a799846a 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin from six import iteritems from warcio.utils import to_native_str +from wsgiprox.wsgiprox import WSGIProxMiddleware from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter from pywb.recorder.recorderapp import RecorderApp @@ -202,7 +203,6 @@ class FrontEndApp(object): metadata = self.get_metadata(coll) if record: metadata['type'] = 'record' - print('RECORD') if timemap_output: metadata['output'] = timemap_output @@ -304,6 +304,17 @@ class FrontEndApp(object): app_server = GeventServer(app, port=port, hostname='0.0.0.0') return app_server + def init_proxy(self, proxy_coll, opts=None): + if not opts: + opts = {'ca_name': 'pywb HTTPS Proxy CA', + 'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')} + + prefix = '/{0}/bn_/'.format(proxy_coll) + + return WSGIProxMiddleware(self, prefix, + proxy_host='pywb.proxy', + proxy_options=opts) + # ============================================================================ class MetadataCache(object): diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 49e29c63..48e2412f 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -6,7 +6,7 @@ from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy from pywb.rewrite.wburl import WbUrl -from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter +from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter from pywb.utils.wbexception import WbException from pywb.utils.canonicalize import canonicalize @@ -122,18 
+122,18 @@ class RewriterApp(object): rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix + is_proxy = ('wsgiprox.proxy_host' in environ) + response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, kwargs) if response: - return self.format_response(response, wb_url, full_prefix, is_timegate) - - is_proxy = ('wsgiprox.proxy_host' in environ) + return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy) if is_proxy: environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] - urlrewriter = SchemeOnlyUrlRewriter(wb_url, '') + urlrewriter = IdentityUrlRewriter(wb_url, '') framed_replay = False else: @@ -293,24 +293,18 @@ class RewriterApp(object): if not is_ajax and self.enable_memento: self._add_memento_links(cdx['url'], full_prefix, memento_dt, cdx['timestamp'], status_headers, - is_timegate) + is_timegate, is_proxy) set_content_loc = True if set_content_loc: status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], url=cdx['url']))) - #gen = buffer_iter(status_headers, gen) response = WbResponse(status_headers, gen) - if is_proxy: - response.status_headers.remove_header('Content-Security-Policy-Report-Only') - response.status_headers.remove_header('Content-Security-Policy') - response.status_headers.remove_header('X-Frame-Options') - return response - def format_response(self, response, wb_url, full_prefix, is_timegate): + def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy): memento_ts = None if not isinstance(response, WbResponse): content_type = 'text/html' @@ -324,11 +318,11 @@ class RewriterApp(object): response = WbResponse.text_response(response, content_type=content_type) self._add_memento_links(wb_url.url, full_prefix, None, memento_ts, - response.status_headers, is_timegate) + response.status_headers, is_timegate, is_proxy) return response def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, 
- status_headers, is_timegate): + status_headers, is_timegate, is_proxy): # memento url + header if not memento_dt and memento_ts: @@ -337,17 +331,21 @@ class RewriterApp(object): if memento_dt: status_headers.headers.append(('Memento-Datetime', memento_dt)) - memento_url = full_prefix + memento_ts + self.replay_mod - memento_url += '/' + url + if is_proxy: + memento_url = url + else: + memento_url = full_prefix + memento_ts + self.replay_mod + memento_url += '/' + url else: memento_url = None timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix) link = [] - link.append(MementoUtils.make_link(url, 'original')) - link.append(MementoUtils.make_link(timegate_url, 'timegate')) - link.append(MementoUtils.make_link(timemap_url, 'timemap')) + if not is_proxy: + link.append(MementoUtils.make_link(url, 'original')) + link.append(MementoUtils.make_link(timegate_url, 'timegate')) + link.append(MementoUtils.make_link(timemap_url, 'timemap')) if memento_dt: link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 9a2ce32f..e851c442 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -34,6 +34,7 @@ class DefaultHeaderRewriter(object): 'content-md5': 'prefix', 'content-range': 'keep', 'content-security-policy': 'prefix', + 'content-security-policy-report-only': 'prefix', 'content-type': 'keep', 'date': 'keep', @@ -102,7 +103,10 @@ class DefaultHeaderRewriter(object): return (name, value) elif rule == 'url-rewrite': - return (name, self.rwinfo.url_rewriter.rewrite(value)) + if self.rwinfo.is_url_rw(): + return (name, self.rwinfo.url_rewriter.rewrite(value)) + else: + return (name, value) elif rule == 'prefix-if-content-rewrite': if self.rwinfo.is_content_rw: diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index acc81f3d..21efb1f5 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ 
b/pywb/rewrite/rewriteinputreq.py @@ -19,9 +19,14 @@ class RewriteInputRequest(DirectWSGIInputRequest): self.rewriter = rewriter self.extra_cookie = None - self.splits = urlsplit(self.url) + is_proxy = ('wsgiprox.proxy_host' in env) + + self.splits = urlsplit(self.url) if not is_proxy else None def get_full_request_uri(self): + if not self.splits: + return self.url + uri = self.splits.path if not uri: uri = '/' @@ -39,17 +44,20 @@ class RewriteInputRequest(DirectWSGIInputRequest): for name, value in iteritems(self.env): if name == 'HTTP_HOST': name = 'Host' - value = self.splits.netloc + if self.splits: + value = self.splits.netloc elif name == 'HTTP_ORIGIN': name = 'Origin' - value = (self.splits.scheme + '://' + self.splits.netloc) + if self.splits: + value = (self.splits.scheme + '://' + self.splits.netloc) elif name == 'HTTP_X_CSRFTOKEN': name = 'X-CSRFToken' - cookie_val = extract_client_cookie(self.env, 'csrftoken') - if cookie_val: - value = cookie_val + if self.splits: + cookie_val = extract_client_cookie(self.env, 'csrftoken') + if cookie_val: + value = cookie_val elif name == 'HTTP_X_PYWB_REQUESTED_WITH': continue @@ -62,12 +70,8 @@ class RewriteInputRequest(DirectWSGIInputRequest): elif name == 'HTTP_X_FORWARDED_PROTO': name = 'X-Forwarded-Proto' - value = self.splits.scheme - - elif name == 'HTTP_COOKIE': - name = 'Cookie' - value = self._req_cookie_rewrite(value) - has_cookies = True + if self.splits: + value = self.splits.scheme elif name.startswith('HTTP_'): name = name[5:].title().replace('_', '-') @@ -81,31 +85,11 @@ class RewriteInputRequest(DirectWSGIInputRequest): if value: headers[name] = value - if not has_cookies: - value = self._req_cookie_rewrite('') - if value: - headers['Cookie'] = value - if self.extra_cookie: headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '') return headers - def _req_cookie_rewrite(self, value): - return value - - rule = self.rewriter.ruleset.get_first_match(self.urlkey) - if not rule or not 
rule.req_cookie_rewrite: - return value - - for cr in rule.req_cookie_rewrite: - try: - value = cr['rx'].sub(cr['replace'], value) - except KeyError: - pass - - return value - def extract_range(self): use_206 = False start = None diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 273d5c36..6a42e3bf 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -161,7 +161,28 @@ class UrlRewriter(object): #================================================================= -class SchemeOnlyUrlRewriter(UrlRewriter): +class IdentityUrlRewriter(UrlRewriter): + """ + No rewriting performed, return original url + """ + def rewrite(self, url, mod=None): + return url + + def get_new_url(self, **kwargs): + return kwargs.get('url', self.wburl.url) + + def rebase_rewriter(self, new_url): + return self + + def get_cookie_rewriter(self, scope=None): + return None + + def deprefix_url(self): + return self.wburl.url + + +#================================================================= +class SchemeOnlyUrlRewriter(IdentityUrlRewriter): """ A url rewriter which ensures that any urls have the same scheme (http or https) as the base url. 
@@ -182,14 +203,3 @@ class SchemeOnlyUrlRewriter(UrlRewriter): return url - def get_new_url(self, **kwargs): - return kwargs.get('url', self.wburl.url) - - def rebase_rewriter(self, new_url): - return self - - def get_cookie_rewriter(self, scope=None): - return None - - def deprefix_url(self): - return self.wburl.url diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 25d7139b..6e18c111 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -337,13 +337,6 @@ rules: - match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{' replace: '{0} "dash": "0", dashmpd: "", ' - req_cookie_rewrite: - - match: '^(((?!PREF).)*)$' - replace: '\1; PREF=f2=40000000' - - - match: '(.*PREF=)([^ ;]*)(.*)' - replace: '\1&f2=40000000\3' - # testing rules -- not for valid domain #================================================================= # this rule block is a non-existent prefix merely for testing @@ -376,17 +369,6 @@ rules: rewrite: js_rewrite_location: urls - req_cookie_rewrite: - - match: '^(((?!FOO).)*)$' - replace: '\1; FOO=bar=1' - - - match: '(.*FOO=)([^ ;]*)(.*)' - replace: '\1&bar=1\3' - - - match: '' - invalid_: '' - - # all domain rules -- fallback to this dataset #================================================================= # Applies to all urls -- should be last diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py index 5a1b19f8..a667caf4 100644 --- a/pywb/warcserver/http.py +++ b/pywb/warcserver/http.py @@ -1,7 +1,9 @@ from requests.adapters import HTTPAdapter +import requests class DefaultAdapters(object): live_adapter = HTTPAdapter(max_retries=3) remote_adapter = HTTPAdapter(max_retries=3) +requests.packages.urllib3.disable_warnings() diff --git a/pywb/warcserver/inputrequest.py b/pywb/warcserver/inputrequest.py index 491e60b1..0af5faf1 100644 --- a/pywb/warcserver/inputrequest.py +++ b/pywb/warcserver/inputrequest.py @@ -3,8 +3,8 @@ from warcio.statusandheaders import StatusAndHeadersParser from warcio.utils import to_native_str -from 
six.moves.urllib.parse import urlsplit, quote, unquote_plus -from six import iteritems, StringIO +from six.moves.urllib.parse import urlsplit, quote, unquote_plus, urlencode +from six import iteritems, StringIO, PY3 from io import BytesIO import base64 @@ -230,7 +230,7 @@ class PostQueryExtractor(object): environ=env, keep_blank_values=True) - if six.PY3: + if PY3: args['encoding'] = 'utf-8' data = cgi.FieldStorage(**args) diff --git a/requirements.txt b/requirements.txt index 175a4fc6..246dc957 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ webencodings gevent==1.2.2 webassets==0.12.1 portalocker +wsgiprox>=1.4.1 diff --git a/tests/test_proxy.py b/tests/test_proxy.py new file mode 100644 index 00000000..6b22f683 --- /dev/null +++ b/tests/test_proxy.py @@ -0,0 +1,101 @@ +from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests + +from .base_config_test import CollsDirMixin +from pywb.utils.geventserver import GeventServer +from pywb.apps.frontendapp import FrontEndApp +from pywb.manager.manager import main as manager + +import os +import requests +import pytest + + +# ============================================================================ +@pytest.fixture(params=['http', 'https']) +def scheme(request): + return request.param + + +# ============================================================================ +class BaseTestProxy(TempDirTests, BaseTestClass): + @classmethod + def setup_class(cls, coll='pywb', config_file='config_test.yaml'): + super(BaseTestProxy, cls).setup_class() + config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file) + + cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem') + + cls.app = FrontEndApp(config_file=config_file) + opts = {'ca_name': 'pywb HTTPS Proxy CA', + 'ca_file_cache': cls.root_ca_file} + + cls.proxy_app = cls.app.init_proxy(coll, opts) + + cls.server = GeventServer(cls.proxy_app) + cls.proxies = cls.proxy_dict(cls.server.port) + + @classmethod 
+ def teardown_class(cls): + cls.server.stop() + + super(BaseTestProxy, cls).teardown_class() + + @classmethod + def proxy_dict(cls, port, host='localhost'): + return {'http': 'http://{0}:{1}'.format(host, port), + 'https': 'https://{0}:{1}'.format(host, port) + } + + +# ============================================================================ +class TestProxy(BaseTestProxy): + def test_proxy_replay(self, scheme): + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + verify=self.root_ca_file) + + assert 'WB Insert' in res.text + assert 'Example Domain' in res.text + + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"' + assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + + +# ============================================================================ +class TestRecordingProxy(CollsDirMixin, BaseTestProxy): + @classmethod + def setup_class(cls, coll='pywb', config_file='config_test.yaml'): + super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml') + manager(['init', 'test']) + + @classmethod + def teardown_class(cls): + if cls.app.recorder: + cls.app.recorder.writer.close() + super(TestRecordingProxy, cls).teardown_class() + + def test_proxy_record(self, scheme): + archive_dir = os.path.join(self.root_dir, '_test_colls', 'test', 'archive') + assert os.path.isdir(archive_dir) + + res = requests.get('{0}://httpbin.org/'.format(scheme), + proxies=self.proxies, + verify=self.root_ca_file) + + assert 'is_live = true' in res.text + assert 'httpbin(1)' in res.text + + assert len(os.listdir(archive_dir)) == 1 + + def test_proxy_replay_recorded(self, scheme): + manager(['reindex', 'test']) + + self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/' + + res = requests.get('{0}://httpbin.org/'.format(scheme), + proxies=self.proxies, + verify=self.root_ca_file) + + assert 'is_live = false' in res.text + assert 'httpbin(1)' in res.text +