1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

mementoindexsource: add a 'Connection: close' header to ensure the connection is closed after a memento timegate query!

io utils: StreamIter() supports a custom closer
responseloader: use release_conn() instead of close() to recycle urllib3 connections!
This commit is contained in:
Ilya Kreymer 2017-06-29 20:03:42 -07:00
parent 9bda61cab5
commit 1bd8a85a4d
3 changed files with 24 additions and 13 deletions

View File

@ -1,13 +1,13 @@
import zlib
from contextlib import closing
from contextlib import closing, contextmanager
from warcio.utils import BUFF_SIZE
from tempfile import SpooledTemporaryFile
#=============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
with closing(stream):
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
with closer(stream):
if header1:
yield header1
@ -21,6 +21,15 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
yield buff
#=============================================================================
@contextmanager
def call_release_conn(stream):
    """Context manager that releases ``stream``'s connection on exit.

    Intended to be passed as the ``closer`` argument of ``StreamIter`` in
    place of ``contextlib.closing``: rather than closing the stream, it
    calls ``stream.release_conn()`` so the underlying connection can be
    recycled.

    If ``stream`` has no ``release_conn`` attribute (i.e. it is a plain
    file-like object), ``stream.close()`` is called instead, which is what
    the default ``closing`` closer would have done.

    :param stream: object exposing ``release_conn()`` and/or ``close()``
    :yields: the same ``stream`` object, unchanged
    """
    try:
        yield stream
    finally:
        # Look the method up defensively: raising AttributeError from a
        # ``finally`` clause would mask any in-flight exception from the
        # ``with`` body and leave the stream open.
        release = getattr(stream, 'release_conn', None)
        if release is not None:
            release()
        else:
            stream.close()
#=============================================================================
def chunk_encode_iter(orig_iter):
for chunk in orig_iter:

View File

@ -32,7 +32,6 @@ class BaseIndexSource(object):
return None
#=============================================================================
class FileIndexSource(BaseIndexSource):
CDX_EXT = ('.cdx', '.cdxj')
@ -360,10 +359,11 @@ class MementoIndexSource(BaseIndexSource):
try:
headers = self._get_headers(params)
headers['Accept-Datetime'] = accept_dt
res = self.sesh.head(url, headers=headers, timeout=None)
res = self.sesh.head(url, headers=headers)
if res.status_code >= 400:
raise NotFoundException(url)
except:
except Exception as e:
print('FAILED:', e)
raise NotFoundException(url)
links = res.headers.get('Link')
@ -374,7 +374,8 @@ class MementoIndexSource(BaseIndexSource):
return links
def _get_headers(self, params):
return {}
headers = {'Connection': 'close'}
return headers
def handle_timemap(self, params):
url = res_template(self.timemap_url, params)
@ -469,11 +470,11 @@ class WBMementoIndexSource(MementoIndexSource):
return ref_url
def _get_headers(self, params):
headers = super(WBMementoIndexSource, self)._get_headers(params)
ref_url = self._get_referrer(params)
if ref_url:
return {'Referer': ref_url}
else:
return {}
headers['Referer'] = ref_url
return headers
def _extract_location(self, url, location):
if not location or not location.startswith(self.prefix):

View File

@ -8,7 +8,7 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.memento import MementoUtils
from pywb.utils.io import StreamIter, compress_gzip_iter
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
@ -63,7 +63,7 @@ class BaseLoader(object):
if not compress:
out_headers['Content-Length'] = other_headers.get('Content-Length')
return out_headers, StreamIter(stream)
return out_headers, StreamIter(stream, closer=call_release_conn)
target_uri = warc_headers.get_header('WARC-Target-URI')
@ -85,7 +85,8 @@ class BaseLoader(object):
streamiter = StreamIter(stream,
header1=warc_headers_buff,
header2=other_headers)
header2=other_headers,
closer=call_release_conn)
if compress:
streamiter = compress_gzip_iter(streamiter)