mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
first pass at simple http proxy! #8
* proxy router for handling proxy mode only * proxy/archival router for handling both archival and proxy modes, toggleable with the 'enable_http_proxy' setting in the config * supports playback of only the most recent capture -- no support yet for selecting a replay date or using the calendar view * not testable with WebTest -- a better way to unit test proxy mode is needed
This commit is contained in:
parent
848dc6d000
commit
1a1aa814d0
@ -75,5 +75,8 @@ hostpaths: ['http://localhost:8080/']
|
|||||||
# ==== New / Experimental Settings ====
|
# ==== New / Experimental Settings ====
|
||||||
# Not yet production ready -- used primarily for testing
|
# Not yet production ready -- used primarily for testing
|
||||||
|
|
||||||
|
# Enable simple http proxy mode
|
||||||
|
enable_http_proxy: false
|
||||||
|
|
||||||
# enable cdx server api for querying cdx directly (experimental)
|
# enable cdx server api for querying cdx directly (experimental)
|
||||||
enable_cdx_api: false
|
enable_cdx_api: false
|
||||||
|
102
pywb/proxy.py
Normal file
102
pywb/proxy.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
from wbrequestresponse import WbResponse, WbRequest
|
||||||
|
from archivalrouter import ArchivalRouter
|
||||||
|
import urlparse
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# An experimental router which combines both archival and proxy modes
|
||||||
|
# http proxy mode support is very simple: only latest capture is available currently
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
class ProxyArchivalRouter:
    """Experimental router combining archival routing with http proxy routing.

    Every request is offered to the archival router first; the proxy
    router is only consulted when the archival router produces no
    (truthy) response.
    """

    def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
        """Build the two underlying routers.

        The proxy router is bound to the handler of the first entry in
        ``routes``; the remaining arguments are passed through to the
        ``ArchivalRouter`` / ``ProxyRouter`` constructors unchanged.
        """
        self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
        # Proxy mode is served by the handler of the first route only
        first_handler = routes[0].handler
        self.proxy = ProxyRouter(first_handler, hostpaths, error_view)
        self.error_view = error_view

    def __call__(self, env):
        """Return the first truthy response from the archival or proxy
        router (tried in that order), or None if neither handles ``env``.
        """
        for router in (self.archival, self.proxy):
            result = router(env)
            if result:
                return result

        return None
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# Simple router which routes http proxy requests
|
||||||
|
# Handles requests of the form: GET http://example.com
|
||||||
|
# Only supports latest capture replay at the moment
|
||||||
|
#=================================================================
|
||||||
|
class ProxyRouter:
    """Simple router for http proxy requests.

    Handles requests of the form ``GET http://example.com`` by wrapping
    them in a ``WbRequest`` flagged with ``is_proxy = True`` and passing
    that to the configured handler. Requests whose url ends in
    ``/proxy.pac`` are answered with a generated Proxy Auto-Config script.
    Only latest-capture replay is supported at the moment.
    """

    def __init__(self, handler, hostpaths = None, error_view = None):
        """Store the routing configuration.

        handler    -- callable invoked with the WbRequest built for a
                      proxied url; must also provide get_wburl_type()
        hostpaths  -- optional iterable of url strings; their hosts are
                      listed as DIRECT (non-proxied) in the PAC script
        error_view -- stored on the instance; not used by this class
        """
        self.handler = handler
        self.hostpaths = hostpaths

        self.error_view = error_view

    def __call__(self, env):
        """Route a single request described by the WSGI-style ``env`` dict.

        Returns the PAC response for urls ending in '/proxy.pac', None
        for anything that is not an absolute 'http://' url (so that a
        caller can fall through to other routing), and otherwise the
        result of calling the handler with the proxy-mode WbRequest.
        """
        url = env['REL_REQUEST_URI']

        if url.endswith('/proxy.pac'):
            return self.make_pac_response(env)

        # Proxy-style requests carry the full absolute url on the request line
        if not url.startswith('http://'):
            return None

        wbrequest = WbRequest(env,
                              request_uri = url,
                              coll = '',
                              wb_url_str = url,
                              wb_prefix = '',
                              use_abs_prefix = False,
                              wburl_class = self.handler.get_wburl_type(),
                              url_rewriter_class = ProxyHttpsUrlRewriter,
                              is_proxy = True)

        return self.handler(wbrequest)

    # Proxy Auto-Config (PAC) script for the proxy
    def make_pac_response(self, env):
        """Return a WbResponse containing a PAC script for this server.

        The script returns DIRECT for the host of every configured
        hostpath and for env['SERVER_NAME'], and sends all other
        traffic through this server at SERVER_NAME:SERVER_PORT.
        """
        server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']

        direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'

        parts = ['function FindProxyForURL (url, host) {\n']

        # hostpaths defaults to None: treat that as "no extra direct hosts"
        # instead of failing with a TypeError while iterating
        for hostpath in self.hostpaths or []:
            # keep only the host portion of the netloc (drop any ':port')
            host = urlparse.urlsplit(hostpath).netloc.split(':')[0]
            parts.append(direct_cond.format(host))

        parts.append(direct_cond.format(env['SERVER_NAME']))

        # The proxy address comes from the request environ rather than
        # from hostpaths[0]
        parts.append('\n return "PROXY {0}";\n}}\n'.format(server_hostport))

        return WbResponse.text_response(''.join(parts), content_type = 'application/x-ns-proxy-autoconfig')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# A rewriter which only rewrites https -> http
|
||||||
|
#=================================================================
|
||||||
|
class ProxyHttpsUrlRewriter:
    """A url rewriter which only rewrites https -> http.

    All other urls, and all timestamp / absolute url lookups, are
    returned unchanged.
    """

    HTTP = 'http://'
    HTTPS = 'https://'

    def __init__(self, wbrequest, prefix):
        # Both arguments are accepted and ignored -- presumably to match
        # the constructor signature of other rewriter classes (confirm
        # against WbRequest, which receives this class as url_rewriter_class)
        pass

    def rewrite(self, url, mod = None):
        """Return ``url`` with a leading 'https://' replaced by 'http://'.

        The scheme is matched case-insensitively, as uri schemes are
        case-insensitive, so 'HTTPS://host/' is rewritten as well.
        Any other url is returned as-is. ``mod`` is ignored.
        """
        prefix_len = len(self.HTTPS)

        if url[:prefix_len].lower() == self.HTTPS:
            return self.HTTP + url[prefix_len:]

        return url

    def get_timestamp_url(self, timestamp, url):
        """Return ``url`` unchanged; ``timestamp`` is ignored."""
        return url

    def get_abs_url(self, url = ''):
        """Return ``url`` unchanged (empty string by default)."""
        return url
|
||||||
|
|
@ -5,7 +5,7 @@ import os
|
|||||||
import yaml
|
import yaml
|
||||||
import config_utils
|
import config_utils
|
||||||
import logging
|
import logging
|
||||||
|
import proxy
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
## Reference non-YAML config
|
## Reference non-YAML config
|
||||||
@ -54,9 +54,14 @@ def pywb_config_manual(config = {}):
|
|||||||
if config.get('debug_echo_req', False):
|
if config.get('debug_echo_req', False):
|
||||||
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
||||||
|
|
||||||
|
# Check for new proxy mode!
|
||||||
|
if config.get('enable_http_proxy', False):
|
||||||
|
router = proxy.ProxyArchivalRouter
|
||||||
|
else:
|
||||||
|
router = archivalrouter.ArchivalRouter
|
||||||
|
|
||||||
# Finally, create wb router
|
# Finally, create wb router
|
||||||
return archivalrouter.ArchivalRouter(
|
return router(
|
||||||
routes,
|
routes,
|
||||||
# Specify hostnames that pywb will be running on
|
# Specify hostnames that pywb will be running on
|
||||||
# This will help catch occasionally missed rewrites that fall-through to the host
|
# This will help catch occasionally missed rewrites that fall-through to the host
|
||||||
|
11
run-tests.py
11
run-tests.py
@ -75,6 +75,15 @@ class TestWb:
|
|||||||
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
|
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
|
||||||
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
||||||
|
|
||||||
|
# XX: Doesn't work as webtest does not support proxy mode
|
||||||
|
# need a way to test
|
||||||
|
#def test_proxy_replay(self):
|
||||||
|
#resp = self.testapp.get('http://www.iana.org/domains/idn-tables')
|
||||||
|
#self._assert_basic_html(resp)
|
||||||
|
|
||||||
|
#assert 'Sun, Jan 26 2014 20:11:27' in resp.body
|
||||||
|
#assert 'wb.js' in resp.body
|
||||||
|
|
||||||
def test_cdx_server_filters(self):
|
def test_cdx_server_filters(self):
|
||||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
|
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
|
||||||
self._assert_basic_text(resp)
|
self._assert_basic_text(resp)
|
||||||
@ -102,5 +111,3 @@ class TestWb:
|
|||||||
resp = self.testapp.get('/pywb/?abc', status = 400)
|
resp = self.testapp.get('/pywb/?abc', status = 400)
|
||||||
assert resp.status_int == 400
|
assert resp.status_int == 400
|
||||||
assert 'Bad Request Url: http://?abc' in resp.body
|
assert 'Bad Request Url: http://?abc' in resp.body
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,5 +75,8 @@ hostpaths: ['http://localhost:8080/']
|
|||||||
# ==== New / Experimental Settings ====
|
# ==== New / Experimental Settings ====
|
||||||
# Not yet production ready -- used primarily for testing
|
# Not yet production ready -- used primarily for testing
|
||||||
|
|
||||||
|
# Enable simple http proxy mode
|
||||||
|
enable_http_proxy: true
|
||||||
|
|
||||||
# enable cdx server api for querying cdx directly (experimental)
|
# enable cdx server api for querying cdx directly (experimental)
|
||||||
enable_cdx_api: true
|
enable_cdx_api: true
|
||||||
|
Loading…
x
Reference in New Issue
Block a user