mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warcserver: define default HTTPAdapter in warcserver.http.default_adapter, for use with both index sources and responseloader
responseloader: uses existing pool from shared HTTPAdapter; fix tests: call_release_conn() checks whether release_conn() exists before calling it, otherwise defaults to close()
This commit is contained in:
parent
1bd8a85a4d
commit
dd7c1bd752
@ -27,7 +27,10 @@ def call_release_conn(stream):
|
|||||||
try:
|
try:
|
||||||
yield stream
|
yield stream
|
||||||
finally:
|
finally:
|
||||||
stream.release_conn()
|
if hasattr(stream, 'release_conn'):
|
||||||
|
stream.release_conn()
|
||||||
|
else:
|
||||||
|
stream.close()
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
|
5
pywb/warcserver/http.py
Normal file
5
pywb/warcserver/http.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
|
||||||
|
default_adapter = HTTPAdapter(max_retries=3)
|
||||||
|
|
||||||
|
|
@ -5,6 +5,7 @@ from pywb.utils.wbexception import NotFoundException
|
|||||||
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||||
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
||||||
|
|
||||||
|
from pywb.warcserver.http import default_adapter
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from pywb.utils.format import ParamFormatter, res_template
|
from pywb.utils.format import ParamFormatter, res_template
|
||||||
@ -31,6 +32,11 @@ class BaseIndexSource(object):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _init_sesh(self):
|
||||||
|
self.sesh = requests.Session()
|
||||||
|
self.sesh.mount('http://', default_adapter)
|
||||||
|
self.sesh.mount('https://', default_adapter)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class FileIndexSource(BaseIndexSource):
|
class FileIndexSource(BaseIndexSource):
|
||||||
@ -96,6 +102,7 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
self.replay_url = replay_url
|
self.replay_url = replay_url
|
||||||
self.url_field = url_field
|
self.url_field = url_field
|
||||||
self.closest_limit = closest_limit
|
self.closest_limit = closest_limit
|
||||||
|
self._init_sesh()
|
||||||
|
|
||||||
def _get_api_url(self, params):
|
def _get_api_url(self, params):
|
||||||
api_url = res_template(self.api_url, params)
|
api_url = res_template(self.api_url, params)
|
||||||
@ -106,8 +113,11 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
api_url = self._get_api_url(params)
|
api_url = self._get_api_url(params)
|
||||||
r = requests.get(api_url, timeout=params.get('_timeout'))
|
try:
|
||||||
if r.status_code >= 400:
|
r = self.sesh.get(api_url, timeout=params.get('_timeout'))
|
||||||
|
r.raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
print('FAILED: ' + str(e))
|
||||||
raise NotFoundException(api_url)
|
raise NotFoundException(api_url)
|
||||||
|
|
||||||
lines = r.content.strip().split(b'\n')
|
lines = r.content.strip().split(b'\n')
|
||||||
@ -317,12 +327,6 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
self.replay_url = replay_url
|
self.replay_url = replay_url
|
||||||
self._init_sesh()
|
self._init_sesh()
|
||||||
|
|
||||||
def _init_sesh(self):
|
|
||||||
self.sesh = requests.Session()
|
|
||||||
adapt = requests.adapters.HTTPAdapter(max_retries=3)
|
|
||||||
self.sesh.mount('http://', adapt)
|
|
||||||
self.sesh.mount('https://', adapt)
|
|
||||||
|
|
||||||
def links_to_cdxobject(self, link_header, def_name):
|
def links_to_cdxobject(self, link_header, def_name):
|
||||||
results = MementoUtils.parse_links(link_header, def_name)
|
results = MementoUtils.parse_links(link_header, def_name)
|
||||||
|
|
||||||
@ -360,10 +364,9 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
headers = self._get_headers(params)
|
headers = self._get_headers(params)
|
||||||
headers['Accept-Datetime'] = accept_dt
|
headers['Accept-Datetime'] = accept_dt
|
||||||
res = self.sesh.head(url, headers=headers)
|
res = self.sesh.head(url, headers=headers)
|
||||||
if res.status_code >= 400:
|
res.raise_for_status()
|
||||||
raise NotFoundException(url)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('FAILED:', e)
|
print('FAILED: ' + str(e))
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
|
||||||
links = res.headers.get('Link')
|
links = res.headers.get('Link')
|
||||||
|
@ -14,6 +14,8 @@ from pywb.utils.format import ParamFormatter
|
|||||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||||
|
|
||||||
|
from pywb.warcserver.http import default_adapter
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
from six.moves.urllib.parse import urlsplit, quote, unquote
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -26,7 +28,6 @@ import glob
|
|||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
from requests.models import PreparedRequest
|
from requests.models import PreparedRequest
|
||||||
from requests.packages import urllib3
|
|
||||||
|
|
||||||
import six.moves.http_client
|
import six.moves.http_client
|
||||||
six.moves.http_client._MAXHEADERS = 10000
|
six.moves.http_client._MAXHEADERS = 10000
|
||||||
@ -233,14 +234,14 @@ class LiveWebLoader(BaseLoader):
|
|||||||
|
|
||||||
UNREWRITE_HEADERS = ('location', 'content-location')
|
UNREWRITE_HEADERS = ('location', 'content-location')
|
||||||
|
|
||||||
def __init__(self, forward_proxy_prefix=None):
|
def __init__(self, forward_proxy_prefix=None, adapter=None):
|
||||||
self.num_retries = 3
|
|
||||||
self.num_pools = 10
|
|
||||||
self.num_conn_per_pool = 10
|
|
||||||
self.forward_proxy_prefix = forward_proxy_prefix
|
self.forward_proxy_prefix = forward_proxy_prefix
|
||||||
|
|
||||||
self.pool = urllib3.PoolManager(num_pools=self.num_pools,
|
if not adapter:
|
||||||
maxsize=self.num_conn_per_pool)
|
adapter = default_adapter
|
||||||
|
|
||||||
|
self.pool = adapter.poolmanager
|
||||||
|
self.max_retries = adapter.max_retries
|
||||||
|
|
||||||
def load_resource(self, cdx, params):
|
def load_resource(self, cdx, params):
|
||||||
load_url = cdx.get('load_url')
|
load_url = cdx.get('load_url')
|
||||||
@ -442,7 +443,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
assert_same_host=False,
|
assert_same_host=False,
|
||||||
preload_content=False,
|
preload_content=False,
|
||||||
decode_content=False,
|
decode_content=False,
|
||||||
retries=self.num_retries,
|
retries=self.max_retries,
|
||||||
timeout=params.get('_timeout'))
|
timeout=params.get('_timeout'))
|
||||||
|
|
||||||
return upstream_res
|
return upstream_res
|
||||||
|
Loading…
x
Reference in New Issue
Block a user