1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

webagg: responseloader: use urllib3 directly instead of requests to

take advantage of connection pooling w/o storing/sharing cookies
This commit is contained in:
Ilya Kreymer 2016-04-27 10:16:54 -07:00
parent 9010e52663
commit 7a0dd463cd

View File

@ -13,12 +13,13 @@ from pywb.warc.resolvingloader import ResolvingLoader
from six.moves.urllib.parse import urlsplit
from io import BytesIO
#from io import BytesIO
import uuid
import six
import itertools
import requests
#import requests
import urllib3
#=============================================================================
@ -216,8 +217,12 @@ class LiveWebLoader(BaseLoader):
'x-archive')
def __init__(self):
#self.sesh = requests.session()
self.sesh = requests
self.num_retries = 3
self.num_pools = 10
self.num_conn_per_pool = 10
self.pool = urllib3.PoolManager(num_pools=self.num_pools,
maxsize=self.num_conn_per_pool)
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
@ -237,13 +242,17 @@ class LiveWebLoader(BaseLoader):
data = input_req.get_req_body()
try:
upstream_res = self.sesh.request(url=load_url,
method=method,
stream=True,
allow_redirects=False,
upstream_res = self.pool.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,
data=data,
redirect=False,
assert_same_host=False,
preload_content=False,
decode_content=False,
retries=self.num_retries,
timeout=params.get('_timeout'))
except Exception as e:
raise LiveResourceException(load_url)
@ -259,26 +268,26 @@ class LiveWebLoader(BaseLoader):
agg_type = upstream_res.headers.get('WebAgg-Type')
if agg_type == 'warc':
cdx['source'] = upstream_res.headers.get('WebAgg-Source-Coll')
return None, upstream_res.headers, upstream_res.raw
return None, upstream_res.headers, upstream_res
self.raise_on_self_redirect(params, cdx,
str(upstream_res.status_code),
str(upstream_res.status),
upstream_res.headers.get('Location'))
if upstream_res.raw.version == 11:
if upstream_res.version == 11:
version = '1.1'
else:
version = '1.0'
status = 'HTTP/{version} {status} {reason}\r\n'
status = status.format(version=version,
status=upstream_res.status_code,
status=upstream_res.status,
reason=upstream_res.reason)
http_headers_buff = status
orig_resp = upstream_res.raw._original_response
orig_resp = upstream_res._original_response
try: #pragma: no cover
#PY 3
@ -301,7 +310,7 @@ class LiveWebLoader(BaseLoader):
http_headers_buff = http_headers_buff.encode('latin-1')
try:
fp = upstream_res.raw._fp.fp
fp = upstream_res._fp.fp
if hasattr(fp, 'raw'): #pragma: no cover
fp = fp.raw
remote_ip = fp._sock.getpeername()[0]
@ -324,7 +333,7 @@ class LiveWebLoader(BaseLoader):
len(http_headers_buff))
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res.raw)
return (warc_headers, http_headers_buff, upstream_res)
@staticmethod
def _make_warc_id(id_=None):