From 84eb070938f83320abb93b4a76dfca3df82ddc66 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 2 Jul 2017 03:58:55 +0000 Subject: [PATCH] warcserver: support different default adapters, for live web and remote sources warcserver.http.DefaultAdapters.live_adapter used if is_live, else DefaultAdapters.remote_adapter tests: fix test to ignore order in dir listing --- pywb/warcserver/http.py | 4 +- pywb/warcserver/index/indexsource.py | 12 +++--- pywb/warcserver/resource/responseloader.py | 40 +++++++++---------- pywb/warcserver/test/test_configwarcserver.py | 2 +- 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py index cd940e0f..8181f41b 100644 --- a/pywb/warcserver/http.py +++ b/pywb/warcserver/http.py @@ -1,5 +1,7 @@ from requests.adapters import HTTPAdapter -default_adapter = HTTPAdapter(max_retries=3) +class DefaultAdapters(object): + live_adapter = HTTPAdapter(max_retries=3) + remote_adapter = HTTPAdapter(pool_connections=8, pool_maxsize=8, pool_block=True) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index aecc2489..669a43be 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -5,7 +5,7 @@ from pywb.utils.wbexception import NotFoundException from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN -from pywb.warcserver.http import default_adapter +from pywb.warcserver.http import DefaultAdapters from pywb.warcserver.index.cdxobject import CDXObject from pywb.utils.format import ParamFormatter, res_template @@ -32,10 +32,12 @@ class BaseIndexSource(object): else: return None - def _init_sesh(self): + def _init_sesh(self, adapter=None): + if not adapter: + adapter = DefaultAdapters.remote_adapter self.sesh = requests.Session() - self.sesh.mount('http://', default_adapter) - self.sesh.mount('https://', default_adapter) + self.sesh.mount('http://', adapter) + self.sesh.mount('https://', adapter) #============================================================================= @@ -193,7 +195,7 @@ class RemoteIndexSource(BaseIndexSource): class LiveIndexSource(BaseIndexSource): def __init__(self, proxy_url='{url}'): self.proxy_url = proxy_url - self._init_sesh() + self._init_sesh(DefaultAdapters.live_adapter) def load_index(self, params): # no fuzzy match for live resources diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index e1fa6726..a398997b 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -14,7 +14,7 @@ from pywb.utils.format import ParamFormatter from pywb.warcserver.resource.resolvingloader import ResolvingLoader from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin -from pywb.warcserver.http import default_adapter +from pywb.warcserver.http import DefaultAdapters from six.moves.urllib.parse import urlsplit, quote, unquote @@ -237,12 +237,6 @@ class LiveWebLoader(BaseLoader): def __init__(self, forward_proxy_prefix=None, adapter=None): self.forward_proxy_prefix = forward_proxy_prefix - if not adapter: - adapter = default_adapter - - self.pool = adapter.poolmanager - self.max_retries = adapter.max_retries - def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: @@ -407,7 +401,8 @@ class LiveWebLoader(BaseLoader): data, req_headers, params, cdx): upstream_res = self._do_request(method, load_url, - data, req_headers, params) + data, req_headers, params, + cdx.get('is_live')) if cdx.get('is_live'): return upstream_res @@ -428,23 +423,28 @@ class LiveWebLoader(BaseLoader): raise load_url = location - upstream_res = self._do_request(method, load_url, data, req_headers, params) + upstream_res = self._do_request(method, load_url, data, + req_headers, params, cdx.get('is_live')) self_redir_count += 1 return upstream_res - def _do_request(self, method, load_url, data, req_headers, params): + def _do_request(self, method, load_url, data, req_headers, params, is_live): + adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter + pool = adapter.poolmanager + max_retries = adapter.max_retries + try: - upstream_res = self.pool.urlopen(method=method, - url=load_url, - body=data, - headers=req_headers, - redirect=False, - assert_same_host=False, - preload_content=False, - decode_content=False, - retries=self.max_retries, - timeout=params.get('_timeout')) + upstream_res = pool.urlopen(method=method, + url=load_url, + body=data, + headers=req_headers, + redirect=False, + assert_same_host=False, + preload_content=False, + decode_content=False, + retries=max_retries, + timeout=params.get('_timeout')) return upstream_res diff --git a/pywb/warcserver/test/test_configwarcserver.py b/pywb/warcserver/test/test_configwarcserver.py index 17ed4810..1032950b 100644 --- a/pywb/warcserver/test/test_configwarcserver.py +++ b/pywb/warcserver/test/test_configwarcserver.py @@ -53,7 +53,7 @@ class TestWarcServer(TempDirTests, BaseTestClass): assert len(self.loader.list_fixed_routes()) == 13 def test_list_dynamic(self): - assert self.loader.list_dynamic_routes() == ['auto1', 'auto2'] + assert set(self.loader.list_dynamic_routes()) == set(['auto1', 'auto2']) def test_remote_cdx(self): sources = self._get_sources('ait')