mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
webagg: responseloader: use urllib3 directly instead of requests to
take advantage of connection pooling without storing or sharing cookies
This commit is contained in:
parent
9010e52663
commit
7a0dd463cd
@ -13,12 +13,13 @@ from pywb.warc.resolvingloader import ResolvingLoader
|
|||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit
|
from six.moves.urllib.parse import urlsplit
|
||||||
|
|
||||||
from io import BytesIO
|
#from io import BytesIO
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
import six
|
import six
|
||||||
import itertools
|
import itertools
|
||||||
import requests
|
#import requests
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -216,8 +217,12 @@ class LiveWebLoader(BaseLoader):
|
|||||||
'x-archive')
|
'x-archive')
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
#self.sesh = requests.session()
|
self.num_retries = 3
|
||||||
self.sesh = requests
|
self.num_pools = 10
|
||||||
|
self.num_conn_per_pool = 10
|
||||||
|
|
||||||
|
self.pool = urllib3.PoolManager(num_pools=self.num_pools,
|
||||||
|
maxsize=self.num_conn_per_pool)
|
||||||
|
|
||||||
def load_resource(self, cdx, params):
|
def load_resource(self, cdx, params):
|
||||||
load_url = cdx.get('load_url')
|
load_url = cdx.get('load_url')
|
||||||
@ -237,13 +242,17 @@ class LiveWebLoader(BaseLoader):
|
|||||||
data = input_req.get_req_body()
|
data = input_req.get_req_body()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
upstream_res = self.sesh.request(url=load_url,
|
upstream_res = self.pool.urlopen(method=method,
|
||||||
method=method,
|
url=load_url,
|
||||||
stream=True,
|
body=data,
|
||||||
allow_redirects=False,
|
|
||||||
headers=req_headers,
|
headers=req_headers,
|
||||||
data=data,
|
redirect=False,
|
||||||
|
assert_same_host=False,
|
||||||
|
preload_content=False,
|
||||||
|
decode_content=False,
|
||||||
|
retries=self.num_retries,
|
||||||
timeout=params.get('_timeout'))
|
timeout=params.get('_timeout'))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise LiveResourceException(load_url)
|
raise LiveResourceException(load_url)
|
||||||
|
|
||||||
@ -259,26 +268,26 @@ class LiveWebLoader(BaseLoader):
|
|||||||
agg_type = upstream_res.headers.get('WebAgg-Type')
|
agg_type = upstream_res.headers.get('WebAgg-Type')
|
||||||
if agg_type == 'warc':
|
if agg_type == 'warc':
|
||||||
cdx['source'] = upstream_res.headers.get('WebAgg-Source-Coll')
|
cdx['source'] = upstream_res.headers.get('WebAgg-Source-Coll')
|
||||||
return None, upstream_res.headers, upstream_res.raw
|
return None, upstream_res.headers, upstream_res
|
||||||
|
|
||||||
self.raise_on_self_redirect(params, cdx,
|
self.raise_on_self_redirect(params, cdx,
|
||||||
str(upstream_res.status_code),
|
str(upstream_res.status),
|
||||||
upstream_res.headers.get('Location'))
|
upstream_res.headers.get('Location'))
|
||||||
|
|
||||||
|
|
||||||
if upstream_res.raw.version == 11:
|
if upstream_res.version == 11:
|
||||||
version = '1.1'
|
version = '1.1'
|
||||||
else:
|
else:
|
||||||
version = '1.0'
|
version = '1.0'
|
||||||
|
|
||||||
status = 'HTTP/{version} {status} {reason}\r\n'
|
status = 'HTTP/{version} {status} {reason}\r\n'
|
||||||
status = status.format(version=version,
|
status = status.format(version=version,
|
||||||
status=upstream_res.status_code,
|
status=upstream_res.status,
|
||||||
reason=upstream_res.reason)
|
reason=upstream_res.reason)
|
||||||
|
|
||||||
http_headers_buff = status
|
http_headers_buff = status
|
||||||
|
|
||||||
orig_resp = upstream_res.raw._original_response
|
orig_resp = upstream_res._original_response
|
||||||
|
|
||||||
try: #pragma: no cover
|
try: #pragma: no cover
|
||||||
#PY 3
|
#PY 3
|
||||||
@ -301,7 +310,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
http_headers_buff = http_headers_buff.encode('latin-1')
|
http_headers_buff = http_headers_buff.encode('latin-1')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fp = upstream_res.raw._fp.fp
|
fp = upstream_res._fp.fp
|
||||||
if hasattr(fp, 'raw'): #pragma: no cover
|
if hasattr(fp, 'raw'): #pragma: no cover
|
||||||
fp = fp.raw
|
fp = fp.raw
|
||||||
remote_ip = fp._sock.getpeername()[0]
|
remote_ip = fp._sock.getpeername()[0]
|
||||||
@ -324,7 +333,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
len(http_headers_buff))
|
len(http_headers_buff))
|
||||||
|
|
||||||
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
||||||
return (warc_headers, http_headers_buff, upstream_res.raw)
|
return (warc_headers, http_headers_buff, upstream_res)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_warc_id(id_=None):
|
def _make_warc_id(id_=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user