1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

warcserver: define default HTTPAdapter in warcserver.http.default_adapter, for use with both index sources and responseloader

responseloader uses existing pool from shared HTTPAdapter
fix tests: call_release_conn() checks whether the stream has release_conn() before calling it, and otherwise falls back to close()
This commit is contained in:
Ilya Kreymer 2017-06-29 22:33:16 -07:00
parent 1bd8a85a4d
commit dd7c1bd752
4 changed files with 32 additions and 20 deletions

View File

@ -27,7 +27,10 @@ def call_release_conn(stream):
try:
yield stream
finally:
stream.release_conn()
if hasattr(stream, 'release_conn'):
stream.release_conn()
else:
stream.close()
#=============================================================================

5
pywb/warcserver/http.py Normal file
View File

@ -0,0 +1,5 @@
from requests.adapters import HTTPAdapter
# Module-level HTTPAdapter (max_retries=3), defined once so that the index
# sources and the response loader can all use the same adapter instance.
default_adapter = HTTPAdapter(max_retries=3)

View File

@ -5,6 +5,7 @@ from pywb.utils.wbexception import NotFoundException
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
from pywb.warcserver.http import default_adapter
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.format import ParamFormatter, res_template
@ -31,6 +32,11 @@ class BaseIndexSource(object):
else:
return None
def _init_sesh(self):
    """Create ``self.sesh`` and route HTTP and HTTPS through ``default_adapter``.

    A new ``requests.Session`` is stored on the instance, then the
    module-level ``default_adapter`` is mounted for the ``http://`` and
    ``https://`` prefixes, in that order.
    """
    self.sesh = session = requests.Session()
    for url_prefix in ('http://', 'https://'):
        session.mount(url_prefix, default_adapter)
#=============================================================================
class FileIndexSource(BaseIndexSource):
@ -96,6 +102,7 @@ class RemoteIndexSource(BaseIndexSource):
self.replay_url = replay_url
self.url_field = url_field
self.closest_limit = closest_limit
self._init_sesh()
def _get_api_url(self, params):
api_url = res_template(self.api_url, params)
@ -106,8 +113,11 @@ class RemoteIndexSource(BaseIndexSource):
def load_index(self, params):
api_url = self._get_api_url(params)
r = requests.get(api_url, timeout=params.get('_timeout'))
if r.status_code >= 400:
try:
r = self.sesh.get(api_url, timeout=params.get('_timeout'))
r.raise_for_status()
except Exception as e:
print('FAILED: ' + str(e))
raise NotFoundException(api_url)
lines = r.content.strip().split(b'\n')
@ -317,12 +327,6 @@ class MementoIndexSource(BaseIndexSource):
self.replay_url = replay_url
self._init_sesh()
def _init_sesh(self):
    """Create ``self.sesh`` with one retrying adapter for HTTP and HTTPS.

    A new ``requests.Session`` is stored on the instance, and a single
    ``HTTPAdapter`` built with ``max_retries=3`` is mounted for both the
    ``http://`` and ``https://`` prefixes.
    """
    self.sesh = requests.Session()
    retrying_adapter = requests.adapters.HTTPAdapter(max_retries=3)
    for url_prefix in ('http://', 'https://'):
        self.sesh.mount(url_prefix, retrying_adapter)
def links_to_cdxobject(self, link_header, def_name):
results = MementoUtils.parse_links(link_header, def_name)
@ -360,10 +364,9 @@ class MementoIndexSource(BaseIndexSource):
headers = self._get_headers(params)
headers['Accept-Datetime'] = accept_dt
res = self.sesh.head(url, headers=headers)
if res.status_code >= 400:
raise NotFoundException(url)
res.raise_for_status()
except Exception as e:
print('FAILED:', e)
print('FAILED: ' + str(e))
raise NotFoundException(url)
links = res.headers.get('Link')

View File

@ -14,6 +14,8 @@ from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
from pywb.warcserver.http import default_adapter
from six.moves.urllib.parse import urlsplit, quote, unquote
from io import BytesIO
@ -26,7 +28,6 @@ import glob
import datetime
from requests.models import PreparedRequest
from requests.packages import urllib3
import six.moves.http_client
six.moves.http_client._MAXHEADERS = 10000
@ -233,14 +234,14 @@ class LiveWebLoader(BaseLoader):
UNREWRITE_HEADERS = ('location', 'content-location')
def __init__(self, forward_proxy_prefix=None):
self.num_retries = 3
self.num_pools = 10
self.num_conn_per_pool = 10
def __init__(self, forward_proxy_prefix=None, adapter=None):
self.forward_proxy_prefix = forward_proxy_prefix
self.pool = urllib3.PoolManager(num_pools=self.num_pools,
maxsize=self.num_conn_per_pool)
if not adapter:
adapter = default_adapter
self.pool = adapter.poolmanager
self.max_retries = adapter.max_retries
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
@ -442,7 +443,7 @@ class LiveWebLoader(BaseLoader):
assert_same_host=False,
preload_content=False,
decode_content=False,
retries=self.num_retries,
retries=self.max_retries,
timeout=params.get('_timeout'))
return upstream_res