1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Proxy Mode Support (#244)

Proxy mode support re-added!
- use wsgiprox wrapper in FrontEndApp.init_proxy() with fixed collection prefix, ca options
- cli --proxy <coll> flag added to specify proxy collection
- cleanup: remove cookie rw (already disabled), fix post handling paths
- headers: ensure request headers are not rewritten when in proxy mode; response headers marked with 'url-rewrite' are also not rewritten when url rewriting is off (no url rewrite/proxy mode)
- urlrewriter: add IdentityUrlRewriter, which performs no rewriting, as default instead of SchemeOnlyUrlRewriter
- memento support: for now, only include rel="original" and Memento-Datetime in the proxy replay response
- responseloader: disable urllib3 insecure response warnings
- tests: add test for proxy replay and proxy record/replay of new collection
This commit is contained in:
Ilya Kreymer 2017-09-27 13:47:02 -07:00 committed by GitHub
parent bbbb62ad52
commit 925f8337a5
11 changed files with 185 additions and 87 deletions

View File

@ -34,6 +34,8 @@ class BaseCli(object):
parser.add_argument('--debug', action='store_true') parser.add_argument('--debug', action='store_true')
parser.add_argument('--profile', action='store_true') parser.add_argument('--profile', action='store_true')
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
parser.add_argument('--live', action='store_true', help='Add /live handler') parser.add_argument('--live', action='store_true', help='Add /live handler')
self.desc = desc self.desc = desc
@ -48,6 +50,9 @@ class BaseCli(object):
self.application = self.load() self.application = self.load()
if self.r.proxy:
self.application = self.application.init_proxy(self.r.proxy)
if self.r.profile: if self.r.profile:
from werkzeug.contrib.profiler import ProfilerMiddleware from werkzeug.contrib.profiler import ProfilerMiddleware
self.application = ProfilerMiddleware(self.application) self.application = ProfilerMiddleware(self.application)

View File

@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin
from six import iteritems from six import iteritems
from warcio.utils import to_native_str from warcio.utils import to_native_str
from wsgiprox.wsgiprox import WSGIProxMiddleware
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.recorderapp import RecorderApp from pywb.recorder.recorderapp import RecorderApp
@ -202,7 +203,6 @@ class FrontEndApp(object):
metadata = self.get_metadata(coll) metadata = self.get_metadata(coll)
if record: if record:
metadata['type'] = 'record' metadata['type'] = 'record'
print('RECORD')
if timemap_output: if timemap_output:
metadata['output'] = timemap_output metadata['output'] = timemap_output
@ -304,6 +304,17 @@ class FrontEndApp(object):
app_server = GeventServer(app, port=port, hostname='0.0.0.0') app_server = GeventServer(app, port=port, hostname='0.0.0.0')
return app_server return app_server
def init_proxy(self, proxy_coll, opts=None):
    """Wrap this application in a wsgiprox proxy middleware.

    :param proxy_coll: collection name used to build the fixed
        ``/<proxy_coll>/bn_/`` prefix handed to the middleware.
    :param opts: optional dict passed through as ``proxy_options``.
        When falsy (``None`` or empty), a default CA name and CA file
        cache path under ``proxy-certs`` are used instead.
    :return: the ``WSGIProxMiddleware`` instance wrapping ``self``.
    """
    if not opts:
        opts = {'ca_name': 'pywb HTTPS Proxy CA',
                'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')}

    prefix = '/{0}/bn_/'.format(proxy_coll)

    return WSGIProxMiddleware(self, prefix,
                              proxy_host='pywb.proxy',
                              proxy_options=opts)
# ============================================================================ # ============================================================================
class MetadataCache(object): class MetadataCache(object):

View File

@ -6,7 +6,7 @@ from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
@ -122,18 +122,18 @@ class RewriterApp(object):
rel_prefix = self.get_rel_prefix(environ) rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix full_prefix = host_prefix + rel_prefix
is_proxy = ('wsgiprox.proxy_host' in environ)
response = self.handle_custom_response(environ, wb_url, response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix, full_prefix, host_prefix,
kwargs) kwargs)
if response: if response:
return self.format_response(response, wb_url, full_prefix, is_timegate) return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
is_proxy = ('wsgiprox.proxy_host' in environ)
if is_proxy: if is_proxy:
environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
urlrewriter = SchemeOnlyUrlRewriter(wb_url, '') urlrewriter = IdentityUrlRewriter(wb_url, '')
framed_replay = False framed_replay = False
else: else:
@ -293,24 +293,18 @@ class RewriterApp(object):
if not is_ajax and self.enable_memento: if not is_ajax and self.enable_memento:
self._add_memento_links(cdx['url'], full_prefix, self._add_memento_links(cdx['url'], full_prefix,
memento_dt, cdx['timestamp'], status_headers, memento_dt, cdx['timestamp'], status_headers,
is_timegate) is_timegate, is_proxy)
set_content_loc = True set_content_loc = True
if set_content_loc: if set_content_loc:
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
url=cdx['url']))) url=cdx['url'])))
#gen = buffer_iter(status_headers, gen)
response = WbResponse(status_headers, gen) response = WbResponse(status_headers, gen)
if is_proxy:
response.status_headers.remove_header('Content-Security-Policy-Report-Only')
response.status_headers.remove_header('Content-Security-Policy')
response.status_headers.remove_header('X-Frame-Options')
return response return response
def format_response(self, response, wb_url, full_prefix, is_timegate): def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
memento_ts = None memento_ts = None
if not isinstance(response, WbResponse): if not isinstance(response, WbResponse):
content_type = 'text/html' content_type = 'text/html'
@ -324,11 +318,11 @@ class RewriterApp(object):
response = WbResponse.text_response(response, content_type=content_type) response = WbResponse.text_response(response, content_type=content_type)
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts, self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate) response.status_headers, is_timegate, is_proxy)
return response return response
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate): status_headers, is_timegate, is_proxy):
# memento url + header # memento url + header
if not memento_dt and memento_ts: if not memento_dt and memento_ts:
@ -337,17 +331,21 @@ class RewriterApp(object):
if memento_dt: if memento_dt:
status_headers.headers.append(('Memento-Datetime', memento_dt)) status_headers.headers.append(('Memento-Datetime', memento_dt))
memento_url = full_prefix + memento_ts + self.replay_mod if is_proxy:
memento_url += '/' + url memento_url = url
else:
memento_url = full_prefix + memento_ts + self.replay_mod
memento_url += '/' + url
else: else:
memento_url = None memento_url = None
timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix) timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)
link = [] link = []
link.append(MementoUtils.make_link(url, 'original')) if not is_proxy:
link.append(MementoUtils.make_link(timegate_url, 'timegate')) link.append(MementoUtils.make_link(url, 'original'))
link.append(MementoUtils.make_link(timemap_url, 'timemap')) link.append(MementoUtils.make_link(timegate_url, 'timegate'))
link.append(MementoUtils.make_link(timemap_url, 'timemap'))
if memento_dt: if memento_dt:
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt)) link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))

View File

@ -34,6 +34,7 @@ class DefaultHeaderRewriter(object):
'content-md5': 'prefix', 'content-md5': 'prefix',
'content-range': 'keep', 'content-range': 'keep',
'content-security-policy': 'prefix', 'content-security-policy': 'prefix',
'content-security-policy-report-only': 'prefix',
'content-type': 'keep', 'content-type': 'keep',
'date': 'keep', 'date': 'keep',
@ -102,7 +103,10 @@ class DefaultHeaderRewriter(object):
return (name, value) return (name, value)
elif rule == 'url-rewrite': elif rule == 'url-rewrite':
return (name, self.rwinfo.url_rewriter.rewrite(value)) if self.rwinfo.is_url_rw():
return (name, self.rwinfo.url_rewriter.rewrite(value))
else:
return (name, value)
elif rule == 'prefix-if-content-rewrite': elif rule == 'prefix-if-content-rewrite':
if self.rwinfo.is_content_rw: if self.rwinfo.is_content_rw:

View File

@ -19,9 +19,14 @@ class RewriteInputRequest(DirectWSGIInputRequest):
self.rewriter = rewriter self.rewriter = rewriter
self.extra_cookie = None self.extra_cookie = None
self.splits = urlsplit(self.url) is_proxy = ('wsgiprox.proxy_host' in env)
self.splits = urlsplit(self.url) if not is_proxy else None
def get_full_request_uri(self): def get_full_request_uri(self):
if not self.splits:
return self.url
uri = self.splits.path uri = self.splits.path
if not uri: if not uri:
uri = '/' uri = '/'
@ -39,17 +44,20 @@ class RewriteInputRequest(DirectWSGIInputRequest):
for name, value in iteritems(self.env): for name, value in iteritems(self.env):
if name == 'HTTP_HOST': if name == 'HTTP_HOST':
name = 'Host' name = 'Host'
value = self.splits.netloc if self.splits:
value = self.splits.netloc
elif name == 'HTTP_ORIGIN': elif name == 'HTTP_ORIGIN':
name = 'Origin' name = 'Origin'
value = (self.splits.scheme + '://' + self.splits.netloc) if self.splits:
value = (self.splits.scheme + '://' + self.splits.netloc)
elif name == 'HTTP_X_CSRFTOKEN': elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken' name = 'X-CSRFToken'
cookie_val = extract_client_cookie(self.env, 'csrftoken') if self.splits:
if cookie_val: cookie_val = extract_client_cookie(self.env, 'csrftoken')
value = cookie_val if cookie_val:
value = cookie_val
elif name == 'HTTP_X_PYWB_REQUESTED_WITH': elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
continue continue
@ -62,12 +70,8 @@ class RewriteInputRequest(DirectWSGIInputRequest):
elif name == 'HTTP_X_FORWARDED_PROTO': elif name == 'HTTP_X_FORWARDED_PROTO':
name = 'X-Forwarded-Proto' name = 'X-Forwarded-Proto'
value = self.splits.scheme if self.splits:
value = self.splits.scheme
elif name == 'HTTP_COOKIE':
name = 'Cookie'
value = self._req_cookie_rewrite(value)
has_cookies = True
elif name.startswith('HTTP_'): elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-') name = name[5:].title().replace('_', '-')
@ -81,31 +85,11 @@ class RewriteInputRequest(DirectWSGIInputRequest):
if value: if value:
headers[name] = value headers[name] = value
if not has_cookies:
value = self._req_cookie_rewrite('')
if value:
headers['Cookie'] = value
if self.extra_cookie: if self.extra_cookie:
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '') headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
return headers return headers
def _req_cookie_rewrite(self, value):
return value
rule = self.rewriter.ruleset.get_first_match(self.urlkey)
if not rule or not rule.req_cookie_rewrite:
return value
for cr in rule.req_cookie_rewrite:
try:
value = cr['rx'].sub(cr['replace'], value)
except KeyError:
pass
return value
def extract_range(self): def extract_range(self):
use_206 = False use_206 = False
start = None start = None

View File

@ -161,7 +161,28 @@ class UrlRewriter(object):
#================================================================= #=================================================================
class SchemeOnlyUrlRewriter(UrlRewriter): class IdentityUrlRewriter(UrlRewriter):
"""
No rewriting performed, return original url
"""
def rewrite(self, url, mod=None):
    """Return ``url`` unchanged; ``mod`` is accepted but not used."""
    return url
def get_new_url(self, **kwargs):
    """Return the ``url`` keyword if given, else ``self.wburl.url``.

    Any other keyword arguments (e.g. ``timestamp``) are ignored.
    """
    return kwargs.get('url', self.wburl.url)
def rebase_rewriter(self, new_url):
    """Return this same rewriter; ``new_url`` is not used."""
    return self
def get_cookie_rewriter(self, scope=None):
    """Return ``None``: no cookie rewriter is provided, whatever ``scope`` is."""
    return None
def deprefix_url(self):
    """Return ``self.wburl.url`` as-is."""
    return self.wburl.url
#=================================================================
class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
""" """
A url rewriter which ensures that any urls have the same A url rewriter which ensures that any urls have the same
scheme (http or https) as the base url. scheme (http or https) as the base url.
@ -182,14 +203,3 @@ class SchemeOnlyUrlRewriter(UrlRewriter):
return url return url
def get_new_url(self, **kwargs):
return kwargs.get('url', self.wburl.url)
def rebase_rewriter(self, new_url):
return self
def get_cookie_rewriter(self, scope=None):
return None
def deprefix_url(self):
return self.wburl.url

View File

@ -337,13 +337,6 @@ rules:
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{' - match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{'
replace: '{0} "dash": "0", dashmpd: "", ' replace: '{0} "dash": "0", dashmpd: "", '
req_cookie_rewrite:
- match: '^(((?!PREF).)*)$'
replace: '\1; PREF=f2=40000000'
- match: '(.*PREF=)([^ ;]*)(.*)'
replace: '\1&f2=40000000\3'
# testing rules -- not for valid domain # testing rules -- not for valid domain
#================================================================= #=================================================================
# this rule block is a non-existent prefix merely for testing # this rule block is a non-existent prefix merely for testing
@ -376,17 +369,6 @@ rules:
rewrite: rewrite:
js_rewrite_location: urls js_rewrite_location: urls
req_cookie_rewrite:
- match: '^(((?!FOO).)*)$'
replace: '\1; FOO=bar=1'
- match: '(.*FOO=)([^ ;]*)(.*)'
replace: '\1&bar=1\3'
- match: ''
invalid_: ''
# all domain rules -- fallback to this dataset # all domain rules -- fallback to this dataset
#================================================================= #=================================================================
# Applies to all urls -- should be last # Applies to all urls -- should be last

View File

@ -1,7 +1,9 @@
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
import requests
class DefaultAdapters(object): class DefaultAdapters(object):
live_adapter = HTTPAdapter(max_retries=3) live_adapter = HTTPAdapter(max_retries=3)
remote_adapter = HTTPAdapter(max_retries=3) remote_adapter = HTTPAdapter(max_retries=3)
requests.packages.urllib3.disable_warnings()

View File

@ -3,8 +3,8 @@ from warcio.statusandheaders import StatusAndHeadersParser
from warcio.utils import to_native_str from warcio.utils import to_native_str
from six.moves.urllib.parse import urlsplit, quote, unquote_plus from six.moves.urllib.parse import urlsplit, quote, unquote_plus, urlencode
from six import iteritems, StringIO from six import iteritems, StringIO, PY3
from io import BytesIO from io import BytesIO
import base64 import base64
@ -230,7 +230,7 @@ class PostQueryExtractor(object):
environ=env, environ=env,
keep_blank_values=True) keep_blank_values=True)
if six.PY3: if PY3:
args['encoding'] = 'utf-8' args['encoding'] = 'utf-8'
data = cgi.FieldStorage(**args) data = cgi.FieldStorage(**args)

View File

@ -12,3 +12,4 @@ webencodings
gevent==1.2.2 gevent==1.2.2
webassets==0.12.1 webassets==0.12.1
portalocker portalocker
wsgiprox>=1.4.1

101
tests/test_proxy.py Normal file
View File

@ -0,0 +1,101 @@
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp
from pywb.manager.manager import main as manager
import os
import requests
import pytest
# ============================================================================
@pytest.fixture(params=['http', 'https'])
def scheme(request):
    """Parametrized fixture: each dependent test runs once per URL scheme."""
    return request.param
# ============================================================================
class BaseTestProxy(TempDirTests, BaseTestClass):
    """Base class that serves a proxy-wrapped FrontEndApp for a test class."""

    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Build the app, wrap it via ``init_proxy`` and start a server.

        :param coll: collection name passed to ``FrontEndApp.init_proxy``.
        :param config_file: config file name, resolved relative to the
            directory containing this test module.
        """
        super(BaseTestProxy, cls).setup_class()
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)

        # NOTE(review): cls.root_dir is presumably set by the parent
        # setup_class (TempDirTests) -- confirm.
        cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')

        cls.app = FrontEndApp(config_file=config_file)
        opts = {'ca_name': 'pywb HTTPS Proxy CA',
                'ca_file_cache': cls.root_ca_file}

        cls.proxy_app = cls.app.init_proxy(coll, opts)

        cls.server = GeventServer(cls.proxy_app)
        cls.proxies = cls.proxy_dict(cls.server.port)

    @classmethod
    def teardown_class(cls):
        """Stop the server, then run the parent teardown."""
        cls.server.stop()

        super(BaseTestProxy, cls).teardown_class()

    @classmethod
    def proxy_dict(cls, port, host='localhost'):
        """Return a ``proxies`` mapping for ``requests`` aimed at host:port."""
        return {'http': 'http://{0}:{1}'.format(host, port),
                'https': 'https://{0}:{1}'.format(host, port)
               }
# ============================================================================
class TestProxy(BaseTestProxy):
    """Replay through the proxy, once per scheme."""

    def test_proxy_replay(self, scheme):
        """Fetch example.com via the proxy and check body and Memento headers."""
        res = requests.get('{0}://example.com/'.format(scheme),
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        assert 'WB Insert' in res.text
        assert 'Example Domain' in res.text

        # in proxy mode the Link header carries only the memento relation
        assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
        assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
# ============================================================================
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
    """Record through the proxy into a new collection, then replay from it."""

    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Start the proxy on 'test/record' and create the 'test' collection.

        The ``coll`` and ``config_file`` arguments are not used; the
        recording collection and config are fixed here.
        """
        super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml')
        manager(['init', 'test'])

    @classmethod
    def teardown_class(cls):
        """Close the recorder's writer if there is one, then tear down."""
        if cls.app.recorder:
            cls.app.recorder.writer.close()

        super(TestRecordingProxy, cls).teardown_class()

    def test_proxy_record(self, scheme):
        """Fetch httpbin.org live via the proxy and check the archive dir."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test', 'archive')
        assert os.path.isdir(archive_dir)

        res = requests.get('{0}://httpbin.org/'.format(scheme),
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        assert 'is_live = true' in res.text
        assert 'httpbin(1)' in res.text

        assert len(os.listdir(archive_dir)) == 1

    def test_proxy_replay_recorded(self, scheme):
        """Reindex 'test', switch the proxy prefix to it and replay."""
        manager(['reindex', 'test'])

        # repoint the running proxy from the recording prefix to the
        # 'test' collection prefix
        self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/'

        res = requests.get('{0}://httpbin.org/'.format(scheme),
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        assert 'is_live = false' in res.text
        assert 'httpbin(1)' in res.text