1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

mementoindexsource: add a 'Connection: close' header to ensure the connection is closed after a memento timegate query!

io utils: StreamIter() supports custom closer
responseloader: use release_conn() instead of close() to recycle urllib3 connections!
This commit is contained in:
Ilya Kreymer 2017-06-29 20:03:42 -07:00
parent 9bda61cab5
commit 1bd8a85a4d
3 changed files with 24 additions and 13 deletions

View File

@ -1,13 +1,13 @@
import zlib import zlib
from contextlib import closing from contextlib import closing, contextmanager
from warcio.utils import BUFF_SIZE from warcio.utils import BUFF_SIZE
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
#============================================================================= #=============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE): def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
with closing(stream): with closer(stream):
if header1: if header1:
yield header1 yield header1
@ -21,6 +21,15 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
yield buff yield buff
#=============================================================================
@contextmanager
def call_release_conn(stream):
    """Yield ``stream`` and release its connection when the block exits.

    Meant to be passed as the ``closer`` argument of ``StreamIter`` in place
    of ``contextlib.closing``, so that the stream's ``release_conn()`` is
    called instead of ``close()`` once iteration finishes or fails.

    :param stream: the object being consumed; normally exposes
        ``release_conn()``.
    :returns: a context manager whose ``as`` target is ``stream`` itself.

    If ``stream`` has no ``release_conn`` attribute, ``close()`` is called
    instead (the same cleanup ``closing`` would do), so an exception raised
    inside the ``with`` body is not masked by an ``AttributeError`` raised
    during cleanup.
    """
    try:
        yield stream
    finally:
        # Look the method up defensively: not every stream handed to
        # StreamIter is guaranteed to provide release_conn().
        release = getattr(stream, 'release_conn', None)
        if release is not None:
            release()
        else:
            stream.close()
#============================================================================= #=============================================================================
def chunk_encode_iter(orig_iter): def chunk_encode_iter(orig_iter):
for chunk in orig_iter: for chunk in orig_iter:

View File

@ -32,7 +32,6 @@ class BaseIndexSource(object):
return None return None
#============================================================================= #=============================================================================
class FileIndexSource(BaseIndexSource): class FileIndexSource(BaseIndexSource):
CDX_EXT = ('.cdx', '.cdxj') CDX_EXT = ('.cdx', '.cdxj')
@ -360,10 +359,11 @@ class MementoIndexSource(BaseIndexSource):
try: try:
headers = self._get_headers(params) headers = self._get_headers(params)
headers['Accept-Datetime'] = accept_dt headers['Accept-Datetime'] = accept_dt
res = self.sesh.head(url, headers=headers, timeout=None) res = self.sesh.head(url, headers=headers)
if res.status_code >= 400: if res.status_code >= 400:
raise NotFoundException(url) raise NotFoundException(url)
except: except Exception as e:
print('FAILED:', e)
raise NotFoundException(url) raise NotFoundException(url)
links = res.headers.get('Link') links = res.headers.get('Link')
@ -374,7 +374,8 @@ class MementoIndexSource(BaseIndexSource):
return links return links
def _get_headers(self, params): def _get_headers(self, params):
return {} headers = {'Connection': 'close'}
return headers
def handle_timemap(self, params): def handle_timemap(self, params):
url = res_template(self.timemap_url, params) url = res_template(self.timemap_url, params)
@ -469,11 +470,11 @@ class WBMementoIndexSource(MementoIndexSource):
return ref_url return ref_url
def _get_headers(self, params): def _get_headers(self, params):
headers = super(WBMementoIndexSource, self)._get_headers(params)
ref_url = self._get_referrer(params) ref_url = self._get_referrer(params)
if ref_url: if ref_url:
return {'Referer': ref_url} headers['Referer'] = ref_url
else: return headers
return {}
def _extract_location(self, url, location): def _extract_location(self, url, location):
if not location or not location.startswith(self.prefix): if not location or not location.startswith(self.prefix):

View File

@ -8,7 +8,7 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
from pywb.utils.io import StreamIter, compress_gzip_iter from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
from pywb.utils.format import ParamFormatter from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader from pywb.warcserver.resource.resolvingloader import ResolvingLoader
@ -63,7 +63,7 @@ class BaseLoader(object):
if not compress: if not compress:
out_headers['Content-Length'] = other_headers.get('Content-Length') out_headers['Content-Length'] = other_headers.get('Content-Length')
return out_headers, StreamIter(stream) return out_headers, StreamIter(stream, closer=call_release_conn)
target_uri = warc_headers.get_header('WARC-Target-URI') target_uri = warc_headers.get_header('WARC-Target-URI')
@ -85,7 +85,8 @@ class BaseLoader(object):
streamiter = StreamIter(stream, streamiter = StreamIter(stream,
header1=warc_headers_buff, header1=warc_headers_buff,
header2=other_headers) header2=other_headers,
closer=call_release_conn)
if compress: if compress:
streamiter = compress_gzip_iter(streamiter) streamiter = compress_gzip_iter(streamiter)