1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Proxy Mode Support (#244)

proxy mode support readded!
- use wsgiprox wrapper in FrontEndApp.init_proxy() with fixed collection prefix, ca options
- cli --proxy <coll> flag added to specify proxy collection
- cleanup: remove cookie rw (already disabled), fix post handling paths
- headers: ensure request headers are not rewritten when in proxy mode; response headers marked with 'url-rewrite' are also not rewritten when url rewriting is disabled (e.g. in proxy mode)
- urlrewriter: add IdentityUrlRewriter, which performs no rewriting, as the proxy mode default instead of SchemeOnlyUrlRewriter
- memento support: for now, only include rel="original" and Memento-Datetime in the proxy replay response
- responseloader: disable urllib3 insecure request warnings
- tests: add test for proxy replay and proxy record/replay of new collection
This commit is contained in:
Ilya Kreymer 2017-09-27 13:47:02 -07:00 committed by GitHub
parent bbbb62ad52
commit 925f8337a5
11 changed files with 185 additions and 87 deletions

View File

@ -34,6 +34,8 @@ class BaseCli(object):
parser.add_argument('--debug', action='store_true')
parser.add_argument('--profile', action='store_true')
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
parser.add_argument('--live', action='store_true', help='Add /live handler')
self.desc = desc
@ -48,6 +50,9 @@ class BaseCli(object):
self.application = self.load()
if self.r.proxy:
self.application = self.application.init_proxy(self.r.proxy)
if self.r.profile:
from werkzeug.contrib.profiler import ProfilerMiddleware
self.application = ProfilerMiddleware(self.application)

View File

@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin
from six import iteritems
from warcio.utils import to_native_str
from wsgiprox.wsgiprox import WSGIProxMiddleware
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.recorderapp import RecorderApp
@ -202,7 +203,6 @@ class FrontEndApp(object):
metadata = self.get_metadata(coll)
if record:
metadata['type'] = 'record'
print('RECORD')
if timemap_output:
metadata['output'] = timemap_output
@ -304,6 +304,17 @@ class FrontEndApp(object):
app_server = GeventServer(app, port=port, hostname='0.0.0.0')
return app_server
def init_proxy(self, proxy_coll, opts=None):
    """Wrap this application in a wsgiprox proxy middleware.

    :param proxy_coll: collection name used to build the
        ``/<proxy_coll>/bn_/`` prefix handed to the middleware
    :param opts: optional dict passed through as ``proxy_options``;
        when falsy, a default CA name and CA file cache path are used
    :return: the ``WSGIProxMiddleware`` instance wrapping this app
    """
    # Any falsy value (None or an empty dict) selects the defaults below.
    proxy_options = opts or {
        'ca_name': 'pywb HTTPS Proxy CA',
        'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
    }

    coll_prefix = '/{0}/bn_/'.format(proxy_coll)

    return WSGIProxMiddleware(self, coll_prefix,
                              proxy_host='pywb.proxy',
                              proxy_options=proxy_options)
# ============================================================================
class MetadataCache(object):

View File

@ -6,7 +6,7 @@ from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
@ -122,18 +122,18 @@ class RewriterApp(object):
rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix
is_proxy = ('wsgiprox.proxy_host' in environ)
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)
if response:
return self.format_response(response, wb_url, full_prefix, is_timegate)
is_proxy = ('wsgiprox.proxy_host' in environ)
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
if is_proxy:
environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')
urlrewriter = IdentityUrlRewriter(wb_url, '')
framed_replay = False
else:
@ -293,24 +293,18 @@ class RewriterApp(object):
if not is_ajax and self.enable_memento:
self._add_memento_links(cdx['url'], full_prefix,
memento_dt, cdx['timestamp'], status_headers,
is_timegate)
is_timegate, is_proxy)
set_content_loc = True
if set_content_loc:
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
url=cdx['url'])))
#gen = buffer_iter(status_headers, gen)
response = WbResponse(status_headers, gen)
if is_proxy:
response.status_headers.remove_header('Content-Security-Policy-Report-Only')
response.status_headers.remove_header('Content-Security-Policy')
response.status_headers.remove_header('X-Frame-Options')
return response
def format_response(self, response, wb_url, full_prefix, is_timegate):
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
memento_ts = None
if not isinstance(response, WbResponse):
content_type = 'text/html'
@ -324,11 +318,11 @@ class RewriterApp(object):
response = WbResponse.text_response(response, content_type=content_type)
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate)
response.status_headers, is_timegate, is_proxy)
return response
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate):
status_headers, is_timegate, is_proxy):
# memento url + header
if not memento_dt and memento_ts:
@ -337,6 +331,9 @@ class RewriterApp(object):
if memento_dt:
status_headers.headers.append(('Memento-Datetime', memento_dt))
if is_proxy:
memento_url = url
else:
memento_url = full_prefix + memento_ts + self.replay_mod
memento_url += '/' + url
else:
@ -345,6 +342,7 @@ class RewriterApp(object):
timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)
link = []
if not is_proxy:
link.append(MementoUtils.make_link(url, 'original'))
link.append(MementoUtils.make_link(timegate_url, 'timegate'))
link.append(MementoUtils.make_link(timemap_url, 'timemap'))

View File

@ -34,6 +34,7 @@ class DefaultHeaderRewriter(object):
'content-md5': 'prefix',
'content-range': 'keep',
'content-security-policy': 'prefix',
'content-security-policy-report-only': 'prefix',
'content-type': 'keep',
'date': 'keep',
@ -102,7 +103,10 @@ class DefaultHeaderRewriter(object):
return (name, value)
elif rule == 'url-rewrite':
if self.rwinfo.is_url_rw():
return (name, self.rwinfo.url_rewriter.rewrite(value))
else:
return (name, value)
elif rule == 'prefix-if-content-rewrite':
if self.rwinfo.is_content_rw:

View File

@ -19,9 +19,14 @@ class RewriteInputRequest(DirectWSGIInputRequest):
self.rewriter = rewriter
self.extra_cookie = None
self.splits = urlsplit(self.url)
is_proxy = ('wsgiprox.proxy_host' in env)
self.splits = urlsplit(self.url) if not is_proxy else None
def get_full_request_uri(self):
if not self.splits:
return self.url
uri = self.splits.path
if not uri:
uri = '/'
@ -39,14 +44,17 @@ class RewriteInputRequest(DirectWSGIInputRequest):
for name, value in iteritems(self.env):
if name == 'HTTP_HOST':
name = 'Host'
if self.splits:
value = self.splits.netloc
elif name == 'HTTP_ORIGIN':
name = 'Origin'
if self.splits:
value = (self.splits.scheme + '://' + self.splits.netloc)
elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken'
if self.splits:
cookie_val = extract_client_cookie(self.env, 'csrftoken')
if cookie_val:
value = cookie_val
@ -62,13 +70,9 @@ class RewriteInputRequest(DirectWSGIInputRequest):
elif name == 'HTTP_X_FORWARDED_PROTO':
name = 'X-Forwarded-Proto'
if self.splits:
value = self.splits.scheme
elif name == 'HTTP_COOKIE':
name = 'Cookie'
value = self._req_cookie_rewrite(value)
has_cookies = True
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
@ -81,31 +85,11 @@ class RewriteInputRequest(DirectWSGIInputRequest):
if value:
headers[name] = value
if not has_cookies:
value = self._req_cookie_rewrite('')
if value:
headers['Cookie'] = value
if self.extra_cookie:
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
return headers
def _req_cookie_rewrite(self, value):
return value
rule = self.rewriter.ruleset.get_first_match(self.urlkey)
if not rule or not rule.req_cookie_rewrite:
return value
for cr in rule.req_cookie_rewrite:
try:
value = cr['rx'].sub(cr['replace'], value)
except KeyError:
pass
return value
def extract_range(self):
use_206 = False
start = None

View File

@ -161,7 +161,28 @@ class UrlRewriter(object):
#=================================================================
class SchemeOnlyUrlRewriter(UrlRewriter):
class IdentityUrlRewriter(UrlRewriter):
    """
    Pass-through url rewriter: urls are returned exactly as given
    """
    def rewrite(self, url, mod=None):
        """Return ``url`` unchanged; ``mod`` is ignored."""
        return url

    def get_new_url(self, **kwargs):
        """Return the ``url`` keyword argument, or this rewriter's wburl url if absent."""
        fallback_url = self.wburl.url
        return kwargs.get('url', fallback_url)

    def rebase_rewriter(self, new_url):
        """Return this same rewriter; ``new_url`` is not used."""
        return self

    def get_cookie_rewriter(self, scope=None):
        """Return None: no cookie rewriter is supplied; ``scope`` is ignored."""
        return None

    def deprefix_url(self):
        """Return the url held by this rewriter's wburl."""
        return self.wburl.url
#=================================================================
class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
"""
A url rewriter which ensures that any urls have the same
scheme (http or https) as the base url.
@ -182,14 +203,3 @@ class SchemeOnlyUrlRewriter(UrlRewriter):
return url
def get_new_url(self, **kwargs):
return kwargs.get('url', self.wburl.url)
def rebase_rewriter(self, new_url):
return self
def get_cookie_rewriter(self, scope=None):
return None
def deprefix_url(self):
return self.wburl.url

View File

@ -337,13 +337,6 @@ rules:
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args":\s*{'
replace: '{0} "dash": "0", dashmpd: "", '
req_cookie_rewrite:
- match: '^(((?!PREF).)*)$'
replace: '\1; PREF=f2=40000000'
- match: '(.*PREF=)([^ ;]*)(.*)'
replace: '\1&f2=40000000\3'
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
@ -376,17 +369,6 @@ rules:
rewrite:
js_rewrite_location: urls
req_cookie_rewrite:
- match: '^(((?!FOO).)*)$'
replace: '\1; FOO=bar=1'
- match: '(.*FOO=)([^ ;]*)(.*)'
replace: '\1&bar=1\3'
- match: ''
invalid_: ''
# all domain rules -- fallback to this dataset
#=================================================================
# Applies to all urls -- should be last

View File

@ -1,7 +1,9 @@
from requests.adapters import HTTPAdapter
import requests
class DefaultAdapters(object):
live_adapter = HTTPAdapter(max_retries=3)
remote_adapter = HTTPAdapter(max_retries=3)
requests.packages.urllib3.disable_warnings()

View File

@ -3,8 +3,8 @@ from warcio.statusandheaders import StatusAndHeadersParser
from warcio.utils import to_native_str
from six.moves.urllib.parse import urlsplit, quote, unquote_plus
from six import iteritems, StringIO
from six.moves.urllib.parse import urlsplit, quote, unquote_plus, urlencode
from six import iteritems, StringIO, PY3
from io import BytesIO
import base64
@ -230,7 +230,7 @@ class PostQueryExtractor(object):
environ=env,
keep_blank_values=True)
if six.PY3:
if PY3:
args['encoding'] = 'utf-8'
data = cgi.FieldStorage(**args)

View File

@ -12,3 +12,4 @@ webencodings
gevent==1.2.2
webassets==0.12.1
portalocker
wsgiprox>=1.4.1

101
tests/test_proxy.py Normal file
View File

@ -0,0 +1,101 @@
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp
from pywb.manager.manager import main as manager
import os
import requests
import pytest
# ============================================================================
@pytest.fixture(params=['http', 'https'])
def scheme(request):
    """Parametrized fixture: yields 'http' and then 'https' to each test using it."""
    selected_scheme = request.param
    return selected_scheme
# ============================================================================
class BaseTestProxy(TempDirTests, BaseTestClass):
    """Shared setup for proxy tests: builds the app, wraps it via init_proxy, serves it."""

    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Create the proxied app for ``coll`` and start a server for it.

        ``config_file`` is resolved relative to the directory of this test module.
        """
        super(BaseTestProxy, cls).setup_class()

        tests_dir = os.path.dirname(os.path.realpath(__file__))
        config_path = os.path.join(tests_dir, config_file)

        # CA file is placed under the test root dir and reused for `verify=`
        cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')

        cls.app = FrontEndApp(config_file=config_path)

        proxy_opts = {
            'ca_name': 'pywb HTTPS Proxy CA',
            'ca_file_cache': cls.root_ca_file,
        }
        cls.proxy_app = cls.app.init_proxy(coll, proxy_opts)

        cls.server = GeventServer(cls.proxy_app)
        cls.proxies = cls.proxy_dict(cls.server.port)

    @classmethod
    def teardown_class(cls):
        """Stop the server, then run the base class teardown."""
        cls.server.stop()

        super(BaseTestProxy, cls).teardown_class()

    @classmethod
    def proxy_dict(cls, port, host='localhost'):
        """Return a ``proxies`` mapping for requests pointing at ``host:port``."""
        http_proxy = 'http://{0}:{1}'.format(host, port)
        https_proxy = 'https://{0}:{1}'.format(host, port)
        return {'http': http_proxy, 'https': https_proxy}
# ============================================================================
class TestProxy(BaseTestProxy):
    """Proxy replay test against the collection configured by BaseTestProxy."""

    def test_proxy_replay(self, scheme):
        """Request example.com through the proxy; check body markers and memento headers."""
        target_url = '{0}://example.com/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'WB Insert' in body
        assert 'Example Domain' in body

        resp_headers = res.headers
        assert resp_headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
        assert resp_headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
# ============================================================================
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
    """Record through the proxy into the 'test' collection, then replay what was recorded."""

    @classmethod
    def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
        """Set up the proxy on 'test/record' with the recording config, then init 'test'.

        The ``coll`` and ``config_file`` arguments are accepted but not used;
        fixed values are passed to the base class instead.
        """
        super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml')
        manager(['init', 'test'])

    @classmethod
    def teardown_class(cls):
        """Close the recorder's writer, if a recorder exists, before base teardown."""
        recorder = cls.app.recorder
        if recorder:
            recorder.writer.close()

        super(TestRecordingProxy, cls).teardown_class()

    def test_proxy_record(self, scheme):
        """Fetch httpbin.org via the proxy and expect exactly one entry in the archive dir."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test', 'archive')
        assert os.path.isdir(archive_dir)

        target_url = '{0}://httpbin.org/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'is_live = true' in body
        assert 'httpbin(1)' in body

        archived = os.listdir(archive_dir)
        assert len(archived) == 1

    def test_proxy_replay_recorded(self, scheme):
        """Reindex 'test', switch the proxy prefix to it, and expect a non-live response."""
        manager(['reindex', 'test'])

        # Repoint the already-running proxy at the 'test' collection
        self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/'

        target_url = '{0}://httpbin.org/'.format(scheme)
        res = requests.get(target_url,
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        body = res.text
        assert 'is_live = false' in body
        assert 'httpbin(1)' in body