mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Proxy Mode Support (#244)
proxy mode support re-added! - use wsgiprox wrapper in FrontEndApp.init_proxy() with fixed collection prefix, ca options - cli --proxy <coll> flag added to specify proxy collection - cleanup: remove cookie rw (already disabled), fix post handling paths - headers: ensure request headers are not rewritten when in proxy mode, response headers marked with 'url-rewrite' are also not rewritten if no url rewrite/proxy mode - urlrewriter: add IdentityUrlRewriter with no rewriting as default, instead of SchemeOnlyUrlRewriter - memento support: for now, only include rel="original" and Memento-Datetime in proxy replay response - responseloader: disable urllib3 insecure response warnings - tests: add test for proxy replay and proxy record/replay of new collection
This commit is contained in:
parent
bbbb62ad52
commit
925f8337a5
@ -34,6 +34,8 @@ class BaseCli(object):
|
||||
parser.add_argument('--debug', action='store_true')
|
||||
parser.add_argument('--profile', action='store_true')
|
||||
|
||||
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
|
||||
|
||||
parser.add_argument('--live', action='store_true', help='Add /live handler')
|
||||
|
||||
self.desc = desc
|
||||
@ -48,6 +50,9 @@ class BaseCli(object):
|
||||
|
||||
self.application = self.load()
|
||||
|
||||
if self.r.proxy:
|
||||
self.application = self.application.init_proxy(self.r.proxy)
|
||||
|
||||
if self.r.profile:
|
||||
from werkzeug.contrib.profiler import ProfilerMiddleware
|
||||
self.application = ProfilerMiddleware(self.application)
|
||||
|
@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin
|
||||
from six import iteritems
|
||||
|
||||
from warcio.utils import to_native_str
|
||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
@ -202,7 +203,6 @@ class FrontEndApp(object):
|
||||
metadata = self.get_metadata(coll)
|
||||
if record:
|
||||
metadata['type'] = 'record'
|
||||
print('RECORD')
|
||||
|
||||
if timemap_output:
|
||||
metadata['output'] = timemap_output
|
||||
@ -304,6 +304,17 @@ class FrontEndApp(object):
|
||||
app_server = GeventServer(app, port=port, hostname='0.0.0.0')
|
||||
return app_server
|
||||
|
||||
def init_proxy(self, proxy_coll, opts=None):
    """Wrap this application in a wsgiprox middleware bound to one collection.

    :param proxy_coll: collection name, formatted into the fixed
        ``/<coll>/bn_/`` prefix handed to the middleware
    :param opts: dict passed through as ``proxy_options``; any falsy value
        (``None`` or an empty dict) selects a default CA name and a CA
        cache file under ``proxy-certs``
    :return: the ``WSGIProxMiddleware`` instance wrapping this app
    """
    # Fall back to the built-in CA settings only when nothing usable was given
    proxy_options = opts or {
        'ca_name': 'pywb HTTPS Proxy CA',
        'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
    }

    fixed_prefix = '/{0}/bn_/'.format(proxy_coll)

    middleware = WSGIProxMiddleware(self, fixed_prefix,
                                    proxy_host='pywb.proxy',
                                    proxy_options=proxy_options)
    return middleware
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class MetadataCache(object):
|
||||
|
@ -6,7 +6,7 @@ from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
@ -122,18 +122,18 @@ class RewriterApp(object):
|
||||
rel_prefix = self.get_rel_prefix(environ)
|
||||
full_prefix = host_prefix + rel_prefix
|
||||
|
||||
is_proxy = ('wsgiprox.proxy_host' in environ)
|
||||
|
||||
response = self.handle_custom_response(environ, wb_url,
|
||||
full_prefix, host_prefix,
|
||||
kwargs)
|
||||
|
||||
if response:
|
||||
return self.format_response(response, wb_url, full_prefix, is_timegate)
|
||||
|
||||
is_proxy = ('wsgiprox.proxy_host' in environ)
|
||||
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
|
||||
|
||||
if is_proxy:
|
||||
environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
|
||||
urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')
|
||||
urlrewriter = IdentityUrlRewriter(wb_url, '')
|
||||
framed_replay = False
|
||||
|
||||
else:
|
||||
@ -293,24 +293,18 @@ class RewriterApp(object):
|
||||
if not is_ajax and self.enable_memento:
|
||||
self._add_memento_links(cdx['url'], full_prefix,
|
||||
memento_dt, cdx['timestamp'], status_headers,
|
||||
is_timegate)
|
||||
is_timegate, is_proxy)
|
||||
|
||||
set_content_loc = True
|
||||
|
||||
if set_content_loc:
|
||||
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
|
||||
url=cdx['url'])))
|
||||
#gen = buffer_iter(status_headers, gen)
|
||||
response = WbResponse(status_headers, gen)
|
||||
|
||||
if is_proxy:
|
||||
response.status_headers.remove_header('Content-Security-Policy-Report-Only')
|
||||
response.status_headers.remove_header('Content-Security-Policy')
|
||||
response.status_headers.remove_header('X-Frame-Options')
|
||||
|
||||
return response
|
||||
|
||||
def format_response(self, response, wb_url, full_prefix, is_timegate):
|
||||
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
|
||||
memento_ts = None
|
||||
if not isinstance(response, WbResponse):
|
||||
content_type = 'text/html'
|
||||
@ -324,11 +318,11 @@ class RewriterApp(object):
|
||||
response = WbResponse.text_response(response, content_type=content_type)
|
||||
|
||||
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
|
||||
response.status_headers, is_timegate)
|
||||
response.status_headers, is_timegate, is_proxy)
|
||||
return response
|
||||
|
||||
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
||||
status_headers, is_timegate):
|
||||
status_headers, is_timegate, is_proxy):
|
||||
|
||||
# memento url + header
|
||||
if not memento_dt and memento_ts:
|
||||
@ -337,6 +331,9 @@ class RewriterApp(object):
|
||||
if memento_dt:
|
||||
status_headers.headers.append(('Memento-Datetime', memento_dt))
|
||||
|
||||
if is_proxy:
|
||||
memento_url = url
|
||||
else:
|
||||
memento_url = full_prefix + memento_ts + self.replay_mod
|
||||
memento_url += '/' + url
|
||||
else:
|
||||
@ -345,6 +342,7 @@ class RewriterApp(object):
|
||||
timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)
|
||||
|
||||
link = []
|
||||
if not is_proxy:
|
||||
link.append(MementoUtils.make_link(url, 'original'))
|
||||
link.append(MementoUtils.make_link(timegate_url, 'timegate'))
|
||||
link.append(MementoUtils.make_link(timemap_url, 'timemap'))
|
||||
|
@ -34,6 +34,7 @@ class DefaultHeaderRewriter(object):
|
||||
'content-md5': 'prefix',
|
||||
'content-range': 'keep',
|
||||
'content-security-policy': 'prefix',
|
||||
'content-security-policy-report-only': 'prefix',
|
||||
'content-type': 'keep',
|
||||
|
||||
'date': 'keep',
|
||||
@ -102,7 +103,10 @@ class DefaultHeaderRewriter(object):
|
||||
return (name, value)
|
||||
|
||||
elif rule == 'url-rewrite':
|
||||
if self.rwinfo.is_url_rw():
|
||||
return (name, self.rwinfo.url_rewriter.rewrite(value))
|
||||
else:
|
||||
return (name, value)
|
||||
|
||||
elif rule == 'prefix-if-content-rewrite':
|
||||
if self.rwinfo.is_content_rw:
|
||||
|
@ -19,9 +19,14 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
self.rewriter = rewriter
|
||||
self.extra_cookie = None
|
||||
|
||||
self.splits = urlsplit(self.url)
|
||||
is_proxy = ('wsgiprox.proxy_host' in env)
|
||||
|
||||
self.splits = urlsplit(self.url) if not is_proxy else None
|
||||
|
||||
def get_full_request_uri(self):
|
||||
if not self.splits:
|
||||
return self.url
|
||||
|
||||
uri = self.splits.path
|
||||
if not uri:
|
||||
uri = '/'
|
||||
@ -39,14 +44,17 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
for name, value in iteritems(self.env):
|
||||
if name == 'HTTP_HOST':
|
||||
name = 'Host'
|
||||
if self.splits:
|
||||
value = self.splits.netloc
|
||||
|
||||
elif name == 'HTTP_ORIGIN':
|
||||
name = 'Origin'
|
||||
if self.splits:
|
||||
value = (self.splits.scheme + '://' + self.splits.netloc)
|
||||
|
||||
elif name == 'HTTP_X_CSRFTOKEN':
|
||||
name = 'X-CSRFToken'
|
||||
if self.splits:
|
||||
cookie_val = extract_client_cookie(self.env, 'csrftoken')
|
||||
if cookie_val:
|
||||
value = cookie_val
|
||||
@ -62,13 +70,9 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
|
||||
elif name == 'HTTP_X_FORWARDED_PROTO':
|
||||
name = 'X-Forwarded-Proto'
|
||||
if self.splits:
|
||||
value = self.splits.scheme
|
||||
|
||||
elif name == 'HTTP_COOKIE':
|
||||
name = 'Cookie'
|
||||
value = self._req_cookie_rewrite(value)
|
||||
has_cookies = True
|
||||
|
||||
elif name.startswith('HTTP_'):
|
||||
name = name[5:].title().replace('_', '-')
|
||||
|
||||
@ -81,31 +85,11 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
if value:
|
||||
headers[name] = value
|
||||
|
||||
if not has_cookies:
|
||||
value = self._req_cookie_rewrite('')
|
||||
if value:
|
||||
headers['Cookie'] = value
|
||||
|
||||
if self.extra_cookie:
|
||||
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
|
||||
|
||||
return headers
|
||||
|
||||
def _req_cookie_rewrite(self, value):
|
||||
return value
|
||||
|
||||
rule = self.rewriter.ruleset.get_first_match(self.urlkey)
|
||||
if not rule or not rule.req_cookie_rewrite:
|
||||
return value
|
||||
|
||||
for cr in rule.req_cookie_rewrite:
|
||||
try:
|
||||
value = cr['rx'].sub(cr['replace'], value)
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
return value
|
||||
|
||||
def extract_range(self):
|
||||
use_206 = False
|
||||
start = None
|
||||
|
@ -161,7 +161,28 @@ class UrlRewriter(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class SchemeOnlyUrlRewriter(UrlRewriter):
|
||||
class IdentityUrlRewriter(UrlRewriter):
    """
    Pass-through url rewriter: urls are handed back exactly as received
    """
    def rewrite(self, url, mod=None):
        """Return ``url`` untouched; ``mod`` is accepted but not used."""
        return url

    def get_new_url(self, **kwargs):
        """Return the ``url`` keyword if given, else the url of this rewriter's wburl."""
        # The fallback is read up front, just as an eagerly evaluated
        # default argument to dict.get() would be.
        fallback_url = self.wburl.url
        return kwargs.get('url', fallback_url)

    def rebase_rewriter(self, new_url):
        """Return this same rewriter instance; ``new_url`` is not used."""
        return self

    def get_cookie_rewriter(self, scope=None):
        """Return ``None``; ``scope`` is not used."""
        return None

    def deprefix_url(self):
        """Return the url held by this rewriter's wburl."""
        return self.wburl.url
|
||||
|
||||
|
||||
#=================================================================
|
||||
class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
|
||||
"""
|
||||
A url rewriter which ensures that any urls have the same
|
||||
scheme (http or https) as the base url.
|
||||
@ -182,14 +203,3 @@ class SchemeOnlyUrlRewriter(UrlRewriter):
|
||||
|
||||
return url
|
||||
|
||||
def get_new_url(self, **kwargs):
|
||||
return kwargs.get('url', self.wburl.url)
|
||||
|
||||
def rebase_rewriter(self, new_url):
|
||||
return self
|
||||
|
||||
def get_cookie_rewriter(self, scope=None):
|
||||
return None
|
||||
|
||||
def deprefix_url(self):
|
||||
return self.wburl.url
|
||||
|
@ -337,13 +337,6 @@ rules:
|
||||
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{'
|
||||
replace: '{0} "dash": "0", dashmpd: "", '
|
||||
|
||||
req_cookie_rewrite:
|
||||
- match: '^(((?!PREF).)*)$'
|
||||
replace: '\1; PREF=f2=40000000'
|
||||
|
||||
- match: '(.*PREF=)([^ ;]*)(.*)'
|
||||
replace: '\1&f2=40000000\3'
|
||||
|
||||
# testing rules -- not for valid domain
|
||||
#=================================================================
|
||||
# this rule block is a non-existent prefix merely for testing
|
||||
@ -376,17 +369,6 @@ rules:
|
||||
rewrite:
|
||||
js_rewrite_location: urls
|
||||
|
||||
req_cookie_rewrite:
|
||||
- match: '^(((?!FOO).)*)$'
|
||||
replace: '\1; FOO=bar=1'
|
||||
|
||||
- match: '(.*FOO=)([^ ;]*)(.*)'
|
||||
replace: '\1&bar=1\3'
|
||||
|
||||
- match: ''
|
||||
invalid_: ''
|
||||
|
||||
|
||||
# all domain rules -- fallback to this dataset
|
||||
#=================================================================
|
||||
# Applies to all urls -- should be last
|
||||
|
@ -1,7 +1,9 @@
|
||||
from requests.adapters import HTTPAdapter
|
||||
import requests
|
||||
|
||||
class DefaultAdapters(object):
|
||||
live_adapter = HTTPAdapter(max_retries=3)
|
||||
remote_adapter = HTTPAdapter(max_retries=3)
|
||||
|
||||
requests.packages.urllib3.disable_warnings()
|
||||
|
||||
|
@ -3,8 +3,8 @@ from warcio.statusandheaders import StatusAndHeadersParser
|
||||
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote_plus
|
||||
from six import iteritems, StringIO
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote_plus, urlencode
|
||||
from six import iteritems, StringIO, PY3
|
||||
from io import BytesIO
|
||||
|
||||
import base64
|
||||
@ -230,7 +230,7 @@ class PostQueryExtractor(object):
|
||||
environ=env,
|
||||
keep_blank_values=True)
|
||||
|
||||
if six.PY3:
|
||||
if PY3:
|
||||
args['encoding'] = 'utf-8'
|
||||
|
||||
data = cgi.FieldStorage(**args)
|
||||
|
@ -12,3 +12,4 @@ webencodings
|
||||
gevent==1.2.2
|
||||
webassets==0.12.1
|
||||
portalocker
|
||||
wsgiprox>=1.4.1
|
||||
|
101
tests/test_proxy.py
Normal file
101
tests/test_proxy.py
Normal file
@ -0,0 +1,101 @@
|
||||
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
|
||||
|
||||
from .base_config_test import CollsDirMixin
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
from pywb.manager.manager import main as manager
|
||||
|
||||
import os
|
||||
import requests
|
||||
import pytest
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@pytest.fixture(params=['http', 'https'])
def scheme(request):
    """Parametrized fixture: supplies 'http' and then 'https' to each test using it."""
    url_scheme = request.param
    return url_scheme
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseTestProxy(TempDirTests, BaseTestClass):
    """Common fixture for proxy tests.

    Builds a FrontEndApp from a config file located next to this test
    module, wraps it via ``init_proxy`` and serves the result with a
    GeventServer. Subclasses reach the proxy through ``cls.proxies`` and
    verify TLS against ``cls.root_ca_file``.
    """
    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Start a proxy server for collection ``coll`` configured by ``config_file``."""
        super(BaseTestProxy, cls).setup_class()

        # config file names are resolved relative to this test module
        tests_dir = os.path.dirname(os.path.realpath(__file__))
        config_path = os.path.join(tests_dir, config_file)

        # presumably cls.root_dir is provided by the parent setup_class -- verify
        cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')

        cls.app = FrontEndApp(config_file=config_path)

        proxy_opts = {
            'ca_name': 'pywb HTTPS Proxy CA',
            'ca_file_cache': cls.root_ca_file,
        }
        cls.proxy_app = cls.app.init_proxy(coll, proxy_opts)

        cls.server = GeventServer(cls.proxy_app)
        cls.proxies = cls.proxy_dict(cls.server.port)

    @classmethod
    def teardown_class(cls):
        """Stop the proxy server, then run the parent class teardown."""
        cls.server.stop()
        super(BaseTestProxy, cls).teardown_class()

    @classmethod
    def proxy_dict(cls, port, host='localhost'):
        """Return a scheme -> proxy url mapping for ``host``:``port``."""
        http_proxy = 'http://{0}:{1}'.format(host, port)
        https_proxy = 'https://{0}:{1}'.format(host, port)
        return {'http': http_proxy, 'https': https_proxy}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestProxy(BaseTestProxy):
    """Replay through the proxy using the base class's default collection and config."""
    def test_proxy_replay(self, scheme):
        """Fetch example.com via the proxy and check body markers and memento headers."""
        target_url = '{0}://example.com/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'WB Insert' in body
        assert 'Example Domain' in body

        headers = res.headers
        assert headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
        assert headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
    """Record through the proxy into the 'test' collection, then replay from it."""
    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Start the proxy on 'test/record' with the recording config, then init 'test'.

        The ``coll`` and ``config_file`` arguments are not used: fixed values
        are passed to the parent setup instead.
        """
        super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml')
        manager(['init', 'test'])

    @classmethod
    def teardown_class(cls):
        """Close the recorder's writer (when a recorder exists) before the parent teardown."""
        recorder = cls.app.recorder
        if recorder:
            recorder.writer.close()
        super(TestRecordingProxy, cls).teardown_class()

    def test_proxy_record(self, scheme):
        """Fetch httpbin.org live via the proxy and expect one entry in the archive dir."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test', 'archive')
        assert os.path.isdir(archive_dir)

        target_url = '{0}://httpbin.org/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'is_live = true' in body
        assert 'httpbin(1)' in body

        archived = os.listdir(archive_dir)
        assert len(archived) == 1

    def test_proxy_replay_recorded(self, scheme):
        """Reindex 'test', point the proxy at its replay prefix and fetch httpbin.org again."""
        manager(['reindex', 'test'])

        # switch the running proxy from the record prefix to the replay prefix
        self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/'

        target_url = '{0}://httpbin.org/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'is_live = false' in body
        assert 'httpbin(1)' in body
|
||||
|
Loading…
x
Reference in New Issue
Block a user