# Patch summary (text before the first "diff --git" header is ignored by patch tools)
#
# * pywb/utils/io.py: call_release_conn() calls stream.release_conn() when the
#   stream has that attribute and falls back to stream.close() otherwise.
# * pywb/warcserver/http.py (new): defines a module-level
#   default_adapter = HTTPAdapter(max_retries=3).
# * pywb/warcserver/index/indexsource.py: _init_sesh() moves from
#   MementoIndexSource up to BaseIndexSource and mounts default_adapter for
#   http:// and https://. RemoteIndexSource now creates a session in __init__
#   and load_index() fetches through it. Both load_index() and the Memento HEAD
#   request use raise_for_status() inside a try block.
# * pywb/warcserver/resource/responseloader.py: LiveWebLoader.__init__ takes an
#   optional adapter (default: default_adapter) and reads its poolmanager and
#   max_retries instead of building its own urllib3.PoolManager.
#
# NOTE(review): in load_index() the request itself is now inside the
#   "except Exception" block, so any exception raised by the request is printed
#   and re-raised as NotFoundException(api_url); previously only
#   status_code >= 400 was mapped to NotFoundException.
# NOTE(review): failures are reported with print(); confirm whether the project
#   wants logging here instead.
# NOTE(review): this file was recovered from a whitespace-collapsed copy.
#   Hunk line counts match the @@ headers, but leading indentation and the
#   position of blank context lines were re-derived from Python block
#   structure (continuation-line alignment in the last hunk is a best guess).
#   Apply with whitespace-insensitive matching or regenerate from the
#   repository if exact context is required.
#
diff --git a/pywb/utils/io.py b/pywb/utils/io.py
index aa55abce..4b6b6ba5 100644
--- a/pywb/utils/io.py
+++ b/pywb/utils/io.py
@@ -27,7 +27,10 @@ def call_release_conn(stream):
     try:
         yield stream
     finally:
-        stream.release_conn()
+        if hasattr(stream, 'release_conn'):
+            stream.release_conn()
+        else:
+            stream.close()
 
 
 #=============================================================================
diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py
new file mode 100644
index 00000000..cd940e0f
--- /dev/null
+++ b/pywb/warcserver/http.py
@@ -0,0 +1,5 @@
+from requests.adapters import HTTPAdapter
+
+default_adapter = HTTPAdapter(max_retries=3)
+
+
diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py
index 5fc62fab..cb3a0951 100644
--- a/pywb/warcserver/index/indexsource.py
+++ b/pywb/warcserver/index/indexsource.py
@@ -5,6 +5,7 @@ from pywb.utils.wbexception import NotFoundException
 from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
 from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
 
+from pywb.warcserver.http import default_adapter
 from pywb.warcserver.index.cdxobject import CDXObject
 from pywb.utils.format import ParamFormatter, res_template
 
@@ -31,6 +32,11 @@ class BaseIndexSource(object):
         else:
             return None
 
+    def _init_sesh(self):
+        self.sesh = requests.Session()
+        self.sesh.mount('http://', default_adapter)
+        self.sesh.mount('https://', default_adapter)
+
 
 #=============================================================================
 class FileIndexSource(BaseIndexSource):
@@ -96,6 +102,7 @@ class RemoteIndexSource(BaseIndexSource):
         self.replay_url = replay_url
         self.url_field = url_field
         self.closest_limit = closest_limit
+        self._init_sesh()
 
     def _get_api_url(self, params):
         api_url = res_template(self.api_url, params)
@@ -106,8 +113,11 @@ class RemoteIndexSource(BaseIndexSource):
 
     def load_index(self, params):
         api_url = self._get_api_url(params)
-        r = requests.get(api_url, timeout=params.get('_timeout'))
-        if r.status_code >= 400:
+        try:
+            r = self.sesh.get(api_url, timeout=params.get('_timeout'))
+            r.raise_for_status()
+        except Exception as e:
+            print('FAILED: ' + str(e))
             raise NotFoundException(api_url)
 
         lines = r.content.strip().split(b'\n')
@@ -317,12 +327,6 @@ class MementoIndexSource(BaseIndexSource):
         self.replay_url = replay_url
         self._init_sesh()
 
-    def _init_sesh(self):
-        self.sesh = requests.Session()
-        adapt = requests.adapters.HTTPAdapter(max_retries=3)
-        self.sesh.mount('http://', adapt)
-        self.sesh.mount('https://', adapt)
-
     def links_to_cdxobject(self, link_header, def_name):
         results = MementoUtils.parse_links(link_header, def_name)
 
@@ -360,10 +364,9 @@ class MementoIndexSource(BaseIndexSource):
             headers = self._get_headers(params)
             headers['Accept-Datetime'] = accept_dt
             res = self.sesh.head(url, headers=headers)
-            if res.status_code >= 400:
-                raise NotFoundException(url)
+            res.raise_for_status()
         except Exception as e:
-            print('FAILED:', e)
+            print('FAILED: ' + str(e))
             raise NotFoundException(url)
 
         links = res.headers.get('Link')
diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py
index 9cc828e0..e1fa6726 100644
--- a/pywb/warcserver/resource/responseloader.py
+++ b/pywb/warcserver/resource/responseloader.py
@@ -14,6 +14,8 @@ from pywb.utils.format import ParamFormatter
 from pywb.warcserver.resource.resolvingloader import ResolvingLoader
 from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
 
+from pywb.warcserver.http import default_adapter
+
 from six.moves.urllib.parse import urlsplit, quote, unquote
 
 from io import BytesIO
@@ -26,7 +28,6 @@ import glob
 import datetime
 
 from requests.models import PreparedRequest
-from requests.packages import urllib3
 
 import six.moves.http_client
 six.moves.http_client._MAXHEADERS = 10000
@@ -233,14 +234,14 @@ class LiveWebLoader(BaseLoader):
 
     UNREWRITE_HEADERS = ('location', 'content-location')
 
-    def __init__(self, forward_proxy_prefix=None):
-        self.num_retries = 3
-        self.num_pools = 10
-        self.num_conn_per_pool = 10
+    def __init__(self, forward_proxy_prefix=None, adapter=None):
         self.forward_proxy_prefix = forward_proxy_prefix
 
-        self.pool = urllib3.PoolManager(num_pools=self.num_pools,
-                                        maxsize=self.num_conn_per_pool)
+        if not adapter:
+            adapter = default_adapter
+
+        self.pool = adapter.poolmanager
+        self.max_retries = adapter.max_retries
 
     def load_resource(self, cdx, params):
         load_url = cdx.get('load_url')
@@ -442,7 +443,7 @@ class LiveWebLoader(BaseLoader):
                                          assert_same_host=False,
                                          preload_content=False,
                                          decode_content=False,
-                                         retries=self.num_retries,
+                                         retries=self.max_retries,
                                          timeout=params.get('_timeout'))
 
         return upstream_res