diff --git a/pywb/framework/cache.py b/pywb/framework/cache.py index 9028828b..3582dedc 100644 --- a/pywb/framework/cache.py +++ b/pywb/framework/cache.py @@ -20,9 +20,15 @@ class UwsgiCache(object): # pragma: no cover uwsgi.cache_del(item) +#================================================================= +class DefaultCache(dict): + def __getitem__(self, item): + return self.get(item) + + #================================================================= def create_cache(): if uwsgi_cache: # pragma: no cover return UwsgiCache() else: - return {} + return DefaultCache() diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 61f07792..ffb2136f 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -163,6 +163,7 @@ class ProxyRouter(object): env['pywb.proxy_req_uri'] = parts.path if parts.query: env['pywb.proxy_req_uri'] += '?' + parts.query + env['pywb.proxy_query'] = parts.query env['pywb_proxy_magic'] = self.magic_name @@ -305,6 +306,7 @@ class ProxyRouter(object): queryparts = env['REL_REQUEST_URI'].split('?', 1) env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' + env['pywb.proxy_query'] = env['QUERY_STRING'] while True: line = buffreader.readline() diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 73dde2dc..46d53f15 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -16,9 +16,12 @@ import json class BaseCollResolver(object): def __init__(self, routes, config): self.routes = routes - self.pre_connect = config.get('pre_connect', False) self.use_default_coll = config.get('use_default_coll', True) + @property + def pre_connect(self): + return False + def resolve(self, env): route = None coll = None @@ -66,10 +69,13 @@ class ProxyAuthResolver(BaseCollResolver): DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' def __init__(self, routes, config): - config['pre_connect'] = True 
super(ProxyAuthResolver, self).__init__(routes, config) self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) + @property + def pre_connect(self): + return True + def get_proxy_coll_ts(self, env): proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') @@ -111,8 +117,32 @@ class IPCacheResolver(BaseCollResolver): self.cache = create_cache() self.magic_name = config['magic_name'] + def _get_ip(self, env): + ip = env['REMOTE_ADDR'] + qs = env.get('pywb.proxy_query') + if qs: + res = urlparse.parse_qs(qs) + + if 'ip' in res: + ip = res['ip'][0] + + return ip + def get_proxy_coll_ts(self, env): ip = env['REMOTE_ADDR'] + qs = env.get('pywb.proxy_query') + if qs: + res = urlparse.parse_qs(qs) + + if 'ip' in res: + ip = res['ip'][0] + + if 'coll' in res: + self.cache[ip + ':c'] = res['coll'][0] + + if 'ts' in res: + self.cache[ip + ':t'] = res['ts'][0] + coll = self.cache[ip + ':c'] ts = self.cache[ip + ':t'] return coll, ts @@ -128,21 +158,8 @@ class IPCacheResolver(BaseCollResolver): return super(IPCacheResolver, self).resolve(env) def handle_magic_page(self, env): - ip = env['REMOTE_ADDR'] - qs = env.get('QUERY_STRING') - if qs: - res = urlparse.parse_qs(qs) - - if 'ip' in res: - ip = res['ip'][0] - - if 'coll' in res: - self.cache[ip + ':c'] = res['coll'][0] - - if 'ts' in res: - self.cache[ip + ':t'] = res['ts'][0] - coll, ts = self.get_proxy_coll_ts(env) + ip = self._get_ip(env) res = json.dumps({'ip': ip, 'coll': coll, 'ts': ts}) return WbResponse.text_response(res, content_type='application/json') @@ -152,7 +169,6 @@ class CookieResolver(BaseCollResolver): SESH_COOKIE_NAME = '__pywb_proxy_sesh' def __init__(self, routes, config): - config['pre_connect'] = False super(CookieResolver, self).__init__(routes, config) self.magic_name = config['magic_name'] self.sethost_prefix = '-sethost.' + self.magic_name + '.' 
@@ -309,10 +325,7 @@ class CookieResolver(BaseCollResolver): ts = None if sesh_id: coll = self.cache[sesh_id + ':c'] - try: - ts = self.cache[sesh_id + ':t'] - except KeyError: - pass + ts = self.cache[sesh_id + ':t'] return coll, ts, sesh_id diff --git a/tests/test_config_proxy_ip.yaml b/tests/test_config_proxy_ip.yaml new file mode 100644 index 00000000..d1e765fb --- /dev/null +++ b/tests/test_config_proxy_ip.yaml @@ -0,0 +1,18 @@ +collections: + all: + - ./sample_archive/cdx/iana.cdx + - ./sample_archive/cdx/dupes.cdx + - ./sample_archive/cdx/post-test.cdx + +archive_paths: ./sample_archive/warcs/ + +enable_http_proxy: true + +proxy_options: + enable_https_proxy: false + + cookie_resolver: ip + use_default_coll: all + + use_banner: true + use_client_rewrite: false diff --git a/tests/test_proxy_http_ip.py b/tests/test_proxy_http_ip.py new file mode 100644 index 00000000..735a345c --- /dev/null +++ b/tests/test_proxy_http_ip.py @@ -0,0 +1,87 @@ +from pytest import raises +import webtest +import base64 + +from pywb.webapp.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app +from pywb.cdx.cdxobject import CDXObject + +from urlparse import urlsplit + + +class TestProxyIPResolver(object): + TEST_CONFIG = 'tests/test_config_proxy_ip.yaml' + + def setup(self): + self.app = init_app(create_wb_router, + load_yaml=True, + config_file=self.TEST_CONFIG) + + self.testapp = webtest.TestApp(self.app) + + def _assert_basic_html(self, resp): + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + assert resp.content_length > 0 + + def _assert_basic_text(self, resp): + assert resp.status_int == 200 + assert resp.content_type == 'text/plain' + assert resp.content_length > 0 + + def get_url(self, uri, addr='127.0.0.1'): + parts = urlsplit(uri) + env = dict(REQUEST_URI=uri, QUERY_STRING=parts.query, SCRIPT_NAME='', REMOTE_ADDR=addr) + # 'Simulating' proxy by setting REQUEST_URI explicitly to full url with empty SCRIPT_NAME + 
return self.testapp.get('/x-ignore-this-x', extra_environ=env) + + def test_proxy_ip_default_ts(self): + resp = self.get_url('http://www.iana.org/') + self._assert_basic_html(resp) + + assert '"20140127171238"' in resp.body + assert 'wb.js' in resp.body + + def test_proxy_ip_get_defaults(self): + resp = self.get_url('http://info.pywb.proxy/') + assert resp.content_type == 'application/json' + assert resp.json == {'ip': '127.0.0.1', 'coll': None, 'ts': None} + + def test_proxy_ip_set_ts(self): + resp = self.get_url('http://info.pywb.proxy/set?ts=1996') + assert resp.content_type == 'application/json' + assert resp.json == {'ip': '127.0.0.1', 'coll': None, 'ts': '1996'} + + def test_proxy_ip_set_ts_coll(self): + resp = self.get_url('http://info.pywb.proxy/set?ts=1996&coll=all') + assert resp.content_type == 'application/json' + assert resp.json == {'ip': '127.0.0.1', 'coll': 'all', 'ts': '1996'} + + def test_proxy_ip_set_ts_coll_diff_ip(self): + resp = self.get_url('http://info.pywb.proxy/set?ts=1996&coll=all', '127.0.0.2') + assert resp.content_type == 'application/json' + assert resp.json == {'ip': '127.0.0.2', 'coll': 'all', 'ts': '1996'} + + resp = self.get_url('http://info.pywb.proxy/') + assert resp.json == {'ip': '127.0.0.1', 'coll': None, 'ts': None} + + resp = self.get_url('http://info.pywb.proxy/set?ip=127.0.0.2&ts=2005') + assert resp.json == {'ip': '127.0.0.2', 'coll': 'all', 'ts': '2005'} + + resp = self.get_url('http://info.pywb.proxy/', '127.0.0.2') + assert resp.json == {'ip': '127.0.0.2', 'coll': 'all', 'ts': '2005'} + + def test_proxy_ip_change_ts_for_ip(self): + resp = self.get_url('http://info.pywb.proxy/set?ip=1.2.3.4&ts=20140126200624') + assert resp.json == {'ip': '1.2.3.4', 'coll': None, 'ts': '20140126200624'} + + # different ts for this ip + resp = self.get_url('http://www.iana.org/', '1.2.3.4') + self._assert_basic_html(resp) + + assert '"20140126200624"' in resp.body + + # defaults for any other ip + resp = 
self.get_url('http://www.iana.org/') + self._assert_basic_html(resp) + assert '"20140127171238"' in resp.body