mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Improved handling of open http connections and file handles (#463)

* improved pywb's closing of open file handles and http connections by adding no_except_close to pywb.utils.io

replaced close calls with no_except_close
reformatted and optimized the imports of the files that were modified

additional CI build fixes:
- pin gevent to 1.4.0 to ensure that pywb builds on Ubuntu use gevent's wheel distribution
- youtube-dl fix: run youtube-dl in quiet mode to avoid errors from youtube-dl logging under pytest
John Berlin 2019-05-15 20:38:12 -04:00 committed by Ilya Kreymer
parent 94784d6e5d
commit 5509c2cc82
13 changed files with 242 additions and 255 deletions
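
A minimal sketch of the cleanup pattern this commit applies (the fetch_headers helper below is illustrative, not part of pywb): repeated try/close/except blocks around response streams are replaced with a single call to the new pywb.utils.io.no_except_close, which also releases pooled urllib3 connections when the object exposes release_conn().

    import requests
    from pywb.utils.io import no_except_close

    def fetch_headers(url):
        """Illustrative helper: stream a response and always release its raw connection."""
        r = requests.get(url, stream=True)
        try:
            return dict(r.headers)
        finally:
            # replaces the repeated pattern:
            #     try:
            #         r.raw.close()
            #     except:
            #         pass
            # no_except_close() swallows any error raised by close() and also
            # calls release_conn() when present, returning the socket to the pool.
            no_except_close(r.raw)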

View File

@ -1,35 +1,23 @@
from io import BytesIO
import requests
from werkzeug.http import HTTP_STATUS_CODES
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from werkzeug.http import HTTP_STATUS_CODES
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.apps.wbrequestresponse import WbResponse
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from io import BytesIO
from copy import copy
import gevent
import json
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.utils.canonicalize import canonicalize
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import WbException
from pywb.warcserver.index.cdxobject import CDXObject
# ============================================================================
@ -40,7 +28,7 @@ class UpstreamException(WbException):
# ============================================================================
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
# pass
@ -84,8 +72,8 @@ class RewriterApp(object):
self.banner_view)
self.frame_insert_view = TopFrameView(self.jinja_env,
self._html_templ('frame_insert_html'),
self.banner_view)
self._html_templ('frame_insert_html'),
self.banner_view)
self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
@ -129,9 +117,9 @@ class RewriterApp(object):
if accept_dt:
try:
wb_url.timestamp = http_date_to_timestamp(accept_dt)
except:
except Exception:
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
#return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
wb_url.type = wb_url.REPLAY
@ -163,7 +151,7 @@ class RewriterApp(object):
range_start = start
range_end = end
#if start with 0, load from upstream, but add range after
# if start with 0, load from upstream, but add range after
if start == 0:
del inputreq.env['HTTP_RANGE']
else:
@ -193,11 +181,6 @@ class RewriterApp(object):
if range_start >= content_length or range_end >= content_length:
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
try:
r.raw.close()
except:
pass
raise UpstreamException(416, url=wb_url.url, details=details)
range_len = range_end - range_start + 1
@ -296,9 +279,10 @@ class RewriterApp(object):
error = None
try:
error = r.raw.read()
r.raw.close()
except:
except Exception:
pass
finally:
no_except_close(r.raw)
if error:
error = error.decode('utf-8')
@ -316,10 +300,7 @@ class RewriterApp(object):
# add trailing slash
new_path = url_parts.path + '/'
try:
r.raw.close()
except:
pass
no_except_close(r.raw)
return self.send_redirect(new_path, url_parts, urlrewriter)
@ -330,9 +311,9 @@ class RewriterApp(object):
memento_dt = r.headers.get('Memento-Datetime')
target_uri = r.headers.get('WARC-Target-URI')
#cdx['urlkey'] = urlkey
#cdx['timestamp'] = http_date_to_timestamp(memento_dt)
#cdx['url'] = target_uri
# cdx['urlkey'] = urlkey
# cdx['timestamp'] = http_date_to_timestamp(memento_dt)
# cdx['url'] = target_uri
set_content_loc = False
@ -343,7 +324,7 @@ class RewriterApp(object):
# if redir to exact, redir if url or ts are different
if self.redirect_to_exact:
if (set_content_loc or
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'],
@ -375,15 +356,15 @@ class RewriterApp(object):
else:
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
head_insert_func = (self.head_insert_view.
create_insert_func(wb_url,
full_prefix,
host_prefix,
top_url,
environ,
framed_replay,
coll=kwargs.get('coll', ''),
replay_mod=self.replay_mod,
config=self.config))
create_insert_func(wb_url,
full_prefix,
host_prefix,
top_url,
environ,
framed_replay,
coll=kwargs.get('coll', ''),
replay_mod=self.replay_mod,
config=self.config))
cookie_rewriter = None
if self.cookie_tracker:
@ -511,7 +492,6 @@ class RewriterApp(object):
return WbResponse.text_response(resp, status=status, content_type='text/html')
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
req_data = inputreq.reconstruct_request(wb_url.url)
@ -618,7 +598,7 @@ class RewriterApp(object):
return scheme + host
def get_rel_prefix(self, environ):
#return request.script_name
# return request.script_name
return environ.get('SCRIPT_NAME') + '/'
def get_full_prefix(self, environ):

View File

@ -1,5 +1,7 @@
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.io import no_except_close
try:
import ujson as json
except ImportError: # pragma: no cover
@ -151,8 +153,7 @@ class WbResponse(object):
self.status_headers.headers)
request_method = env['REQUEST_METHOD']
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
if hasattr(self.body, 'close'):
self.body.close()
no_except_close(self.body)
return []
return self.body

View File

@ -2,15 +2,14 @@ import base64
import datetime
import os
import shutil
import traceback
import portalocker
from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.utils.format import res_template
from pywb.utils.io import no_except_close
# ============================================================================
@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
try:
os.makedirs(path)
except:
except Exception:
pass
fh = open(filename, 'a+b')
@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
try:
if os.name != 'nt':
portalocker.lock(fh, portalocker.LOCK_UN)
fh.close()
return True
except Exception as e:
print(e)
return False
finally:
no_except_close(fh)
def get_dir_key(self, params):
return res_template(self.key_template, params)
@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
for dir_key, out, filename in self.iter_open_files():
try:
mtime = os.path.getmtime(filename)
except:
except Exception:
self.close_key(dir_key)
return

View File

@ -1,26 +1,21 @@
from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.format import ParamFormatter, res_template
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
from warcio.recordloader import ArcWarcRecordLoader
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
from six.moves.urllib.parse import parse_qsl
import six
import json
import tempfile
import requests
import traceback
import gevent.queue
import gevent
import gevent.queue
import requests
import six
from six.moves.urllib.parse import parse_qsl
from warcio.recordloader import ArcWarcRecordLoader
from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
from pywb.utils.format import ParamFormatter
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
#==============================================================================
# ==============================================================================
class RecorderApp(object):
def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
self.upstream_host = upstream_host
@ -52,13 +47,13 @@ class RecorderApp(object):
@staticmethod
def default_create_buffer(params, name):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
return tempfile.SpooledTemporaryFile(max_size=512 * 1024)
def _write_loop(self):
while True:
try:
self._write_one()
except:
except Exception:
traceback.print_exc()
def _write_one(self):
@ -88,14 +83,13 @@ class RecorderApp(object):
else:
self.writer.write_record(resp, params)
finally:
try:
if req_pay:
req_pay.close()
no_except_close(req_pay)
if resp_pay:
resp_pay.close()
no_except_close(resp_pay)
except Exception as e:
traceback.print_exc()
@ -155,7 +149,7 @@ class RecorderApp(object):
finally:
if req_stream:
req_stream.out.close()
no_except_close(req_stream.out)
return self.send_message(msg,
'200 OK',
@ -169,8 +163,7 @@ class RecorderApp(object):
def __call__(self, environ, start_response):
try:
return self.handle_call(environ, start_response)
except:
import traceback
except Exception:
traceback.print_exc()
def handle_call(self, environ, start_response):
@ -217,15 +210,15 @@ class RecorderApp(object):
try:
res = requests.request(url=self.upstream_host + request_uri,
method=method,
data=data,
headers=headers,
allow_redirects=False,
stream=True)
method=method,
data=data,
headers=headers,
allow_redirects=False,
stream=True)
res.raise_for_status()
except Exception as e:
if req_is_wrapped:
req_stream.out.close()
no_except_close(req_stream.out)
return self.send_error(e, start_response)
if not skipping:
@ -233,8 +226,7 @@ class RecorderApp(object):
req_stream.headers,
res.headers,
params)
for x in self.skip_filters)
for x in self.skip_filters)
if not skipping:
resp_stream = RespWrapper(res.raw,
@ -248,7 +240,7 @@ class RecorderApp(object):
else:
resp_stream = res.raw
if req_is_wrapped:
req_stream.out.close()
no_except_close(req_stream.out)
resp_iter = StreamIter(resp_stream)
@ -260,7 +252,7 @@ class RecorderApp(object):
return resp_iter
#==============================================================================
# ==============================================================================
class Wrapper(object):
def __init__(self, stream, params, create_func):
self.stream = stream
@ -280,7 +272,7 @@ class Wrapper(object):
return buff
#==============================================================================
# ==============================================================================
class RespWrapper(Wrapper):
def __init__(self, stream, headers, req,
params, queue, path, create_func):
@ -319,23 +311,20 @@ class RespWrapper(Wrapper):
entry = (self.req.headers, self.req.out,
self.headers, self.out, self.params)
self.queue.put(entry)
except:
except Exception:
traceback.print_exc()
skipping = True
finally:
try:
if skipping:
self.out.close()
self.req.out.close()
except:
traceback.print_exc()
if skipping:
no_except_close(self.out)
no_except_close(self.req.out)
self.req.close()
no_except_close(self.req)
self.req = None
#==============================================================================
# ==============================================================================
class ReqWrapper(Wrapper):
def __init__(self, stream, req_headers, params, create_func):
super(ReqWrapper, self).__init__(stream, params, create_func)
@ -348,5 +337,3 @@ class ReqWrapper(Wrapper):
def close(self):
# no need to close wsgi.input
pass

View File

@ -1,19 +1,15 @@
from io import BytesIO
import codecs
import json
import re
import tempfile
from contextlib import closing
import webencodings
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
from warcio.utils import to_native_str
import re
import webencodings
import tempfile
import json
import codecs
from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.loaders import load_yaml_config, load_py_name
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
from pywb.utils.loaders import load_py_name, load_yaml_config
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
@ -344,7 +340,7 @@ class StreamingRewriter(object):
yield buff.encode(charset)
finally:
stream.close()
no_except_close(stream)
# ============================================================================

View File

@ -1,6 +1,8 @@
from gevent.pywsgi import WSGIServer, WSGIHandler
from gevent import spawn
import logging
import traceback
from gevent import spawn
from gevent.pywsgi import WSGIHandler, WSGIServer
# ============================================================================

View File

@ -1,12 +1,35 @@
import zlib
from contextlib import closing, contextmanager
from warcio.utils import BUFF_SIZE
from warcio.limitreader import LimitReader
from tempfile import SpooledTemporaryFile
from warcio.limitreader import LimitReader
from warcio.utils import BUFF_SIZE
#=============================================================================
def no_except_close(closable):
"""Attempts to call the close method of the
supplied object.
:param closable: The object to be closed
:rtype: None
"""
if not closable:
return
try:
closable.close()
except Exception:
pass
try:
release_conn = getattr(closable, 'release_conn', None)
if release_conn is not None:
release_conn()
except Exception:
pass
# =============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
with closer(stream):
if header1:
@ -22,19 +45,16 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closin
yield buff
#=============================================================================
# =============================================================================
@contextmanager
def call_release_conn(stream):
try:
yield stream
finally:
if hasattr(stream, 'release_conn'):
stream.release_conn()
else:
stream.close()
no_except_close(stream)
#=============================================================================
# =============================================================================
def chunk_encode_iter(orig_iter):
for chunk in orig_iter:
if not len(chunk):
@ -47,7 +67,7 @@ def chunk_encode_iter(orig_iter):
yield b'0\r\n\r\n'
#=============================================================================
# =============================================================================
def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
out = SpooledTemporaryFile(buff_size)
size = 0
@ -65,7 +85,7 @@ def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
return StreamIter(out)
#=============================================================================
# =============================================================================
def compress_gzip_iter(orig_iter):
compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
for chunk in orig_iter:
@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader):
def readline(self, length=None):
self._skip()
return super(OffsetLimitReader, self).readline(length)
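
A quick usage sketch of the two helpers touched above; the AlreadyClosed class is hypothetical, used only to show that exceptions raised by close() are suppressed.

    from io import BytesIO
    from pywb.utils.io import call_release_conn, no_except_close

    class AlreadyClosed(BytesIO):
        """Hypothetical stream whose close() raises, to exercise no_except_close()."""
        def close(self):
            super(AlreadyClosed, self).close()
            raise ValueError('already closed')

    no_except_close(None)             # falsy objects are ignored
    no_except_close(AlreadyClosed())  # the ValueError from close() is swallowed

    # call_release_conn() now delegates to no_except_close(), so a plain stream
    # without a release_conn() method is simply closed when the block exits.
    stream = BytesIO(b'abc')
    with call_release_conn(stream) as s:
        assert s.read() == b'abc'
    assert stream.closed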

View File

@ -11,22 +11,22 @@ import requests
import yaml
import six
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
from six.moves.urllib.parse import unquote_plus, urlsplit
import time
import pkgutil
import base64
import cgi
from io import open, BytesIO
from warcio.limitreader import LimitReader
from pywb.utils.io import no_except_close
try:
import boto3
from botocore import UNSIGNED
from botocore.client import Config
s3_avail = True
except ImportError: #pragma: no cover
except ImportError: # pragma: no cover
s3_avail = False
@ -39,12 +39,12 @@ def load_py_name(string):
return getattr(mod, string[1])
#=================================================================
# =================================================================
def is_http(filename):
return filename.startswith(('http://', 'https://'))
#=================================================================
# =================================================================
def to_file_url(filename):
""" Convert a filename to a file:// url
"""
@ -52,7 +52,7 @@ def to_file_url(filename):
return url
#=================================================================
# =================================================================
def from_file_url(url):
""" Convert from file:// url to file path
"""
@ -62,7 +62,7 @@ def from_file_url(url):
return url
#=================================================================
# =================================================================
def load(filename):
return BlockLoader().load(filename)
@ -75,8 +75,7 @@ def load_yaml_config(config_file):
configdata = load(config_file)
config = yaml.load(configdata)
finally:
if configdata:
configdata.close()
no_except_close(configdata)
return config
@ -84,7 +83,6 @@ def load_yaml_config(config_file):
# =============================================================================
def load_overlay_config(main_env_var, main_default_file='',
overlay_env_var='', overlay_file=''):
configfile = os.environ.get(main_env_var, main_default_file)
config = None
@ -104,7 +102,7 @@ def load_overlay_config(main_env_var, main_default_file='',
return config
#=================================================================
# =================================================================
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')
if not cookie_header:
@ -129,7 +127,7 @@ def extract_client_cookie(env, cookie_name):
return value
#=================================================================
# =================================================================
def read_last_line(fh, offset=256):
""" Read last line from a seekable file. Start reading
from buff before end of file, and double backwards seek
@ -150,7 +148,7 @@ def read_last_line(fh, offset=256):
return fh.readlines()[-1]
#=================================================================
# =================================================================
class BaseLoader(object):
def __init__(self, **kwargs):
pass
@ -159,7 +157,7 @@ class BaseLoader(object):
raise NotImplemented()
#=================================================================
# =================================================================
class BlockLoader(BaseLoader):
"""
a loader which can stream blocks of content
@ -171,6 +169,7 @@ class BlockLoader(BaseLoader):
profile_loader = None
def __init__(self, **kwargs):
super(BlockLoader, self).__init__()
self.cached = {}
self.kwargs = kwargs
@ -241,7 +240,7 @@ class BlockLoader(BaseLoader):
return range_header
#=================================================================
# =================================================================
class PackageLoader(BaseLoader):
def load(self, url, offset=0, length=-1):
if url.startswith('pkg://'):
@ -263,11 +262,11 @@ class PackageLoader(BaseLoader):
buff.name = url
return buff
#afile = pkg_resources.resource_stream(pkg_split[0],
# afile = pkg_resources.resource_stream(pkg_split[0],
# pkg_split[1])
#=================================================================
# =================================================================
class LocalFileLoader(PackageLoader):
def load(self, url, offset=0, length=-1):
"""
@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader):
file_only = True
url = filename
afile = None
try:
# first, try as file
afile = open(url, 'rb')
except IOError:
no_except_close(afile)
if file_only:
raise
@ -302,9 +303,10 @@ class LocalFileLoader(PackageLoader):
return afile
#=================================================================
# =================================================================
class HttpLoader(BaseLoader):
def __init__(self, **kwargs):
super(HttpLoader, self).__init__()
self.cookie_maker = kwargs.get('cookie_maker')
if not self.cookie_maker:
self.cookie_maker = kwargs.get('cookie')
@ -333,16 +335,17 @@ class HttpLoader(BaseLoader):
return r.raw
#=================================================================
# =================================================================
class S3Loader(BaseLoader):
def __init__(self, **kwargs):
super(S3Loader, self).__init__()
self.client = None
self.aws_access_key_id = kwargs.get('aws_access_key_id')
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
def load(self, url, offset, length):
if not s3_avail: #pragma: no cover
raise IOError('To load from s3 paths, ' +
if not s3_avail: # pragma: no cover
raise IOError('To load from s3 paths, ' +
'you must install boto3: pip install boto3')
aws_access_key_id = self.aws_access_key_id
@ -372,8 +375,8 @@ class S3Loader(BaseLoader):
config = None
client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
config=config)
aws_secret_access_key=aws_secret_access_key,
config=config)
else:
client = self.client
@ -398,15 +401,16 @@ class S3Loader(BaseLoader):
return obj['Body']
#=================================================================
# =================================================================
# Signed Cookie-Maker
#=================================================================
# =================================================================
class HMACCookieMaker(object):
"""
Utility class to produce signed HMAC digest cookies
to be used with each http request
"""
def __init__(self, key, name, duration=10):
self.key = key
self.name = name
@ -435,4 +439,3 @@ class HMACCookieMaker(object):
# ============================================================================
BlockLoader.init_default_loaders()

View File

@ -1,22 +1,18 @@
from pywb.utils.binsearch import iter_range
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.format import ParamFormatter, res_template
from pywb.utils.memento import MementoUtils
import logging
import re
import redis
import requests
from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
import re
import logging
from pywb.utils.binsearch import iter_range
from pywb.utils.canonicalize import canonicalize
from pywb.utils.format import res_template
from pywb.utils.io import no_except_close
from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.index.cdxobject import CDXObject
#=============================================================================
@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource):
def handle_timemap(self, params):
url = res_template(self.timemap_url, params)
headers = self._get_headers(params)
res = None
try:
res = self.sesh.get(url,
headers=headers,
timeout=params.get('_timeout'))
res.raise_for_status()
assert(res.text)
except Exception as e:
no_except_close(res)
self.logger.debug('FAILED: ' + str(e))
raise NotFoundException(url)
@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource):
url = params['url']
load_url = self.timegate_url.format(url=url, timestamp=timestamp)
res = None
try:
headers = self._get_headers(params)
res = self.sesh.head(load_url, headers=headers)
except Exception as e:
no_except_close(res)
raise NotFoundException(url)
if res and res.headers.get('Memento-Datetime'):
if res.status_code >= 400:
no_except_close(res)
raise NotFoundException(url)
if res.status_code >= 300:

View File

@ -1,24 +1,20 @@
import datetime
import itertools
import json
import logging
import os
from io import BytesIO
import os
import collections
import itertools
import logging
import datetime
import json
import six
from six.moves import map
from warcio.bufferedreaders import gzip_decompressor
#from pywb.warcserver.index.cdxsource import CDXSource
from pywb.warcserver.index.indexsource import BaseIndexSource
from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject
from pywb.warcserver.index.query import CDXQuery
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.binsearch import iter_range, linearsearch, search
from pywb.utils.io import no_except_close
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject
# from pywb.warcserver.index.cdxsource import CDXSource
from pywb.warcserver.index.indexsource import BaseIndexSource
from pywb.warcserver.index.query import CDXQuery
# ============================================================================
@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource):
if end_line == last_line and query.key >= last_line:
first_line = last_line
else:
reader.close()
no_except_close(reader)
if query.page_count:
yield self._page_info(0, pagesize, 0)
return
@ -240,13 +236,13 @@ class ZipNumIndexSource(BaseIndexSource):
blocks = -1
yield self._page_info(total_pages, pagesize, blocks + 1)
reader.close()
no_except_close(reader)
return
curr_page = query.page
if curr_page >= total_pages or curr_page < 0:
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
reader.close()
no_except_close(reader)
raise CDXException(msg.format(curr_page, total_pages - 1))
startline = curr_page * pagesize
@ -259,12 +255,14 @@ class ZipNumIndexSource(BaseIndexSource):
else:
startline -= 1
idxiter = itertools.islice(first_iter, startline, endline)
for idx in idxiter:
yield idx
reader.close()
try:
idxiter = itertools.islice(first_iter, startline, endline)
for idx in idxiter:
yield idx
except Exception:
pass
finally:
no_except_close(reader)
def search_by_line_num(self, reader, line): # pragma: no cover
def line_cmp(line1, line2):
@ -349,7 +347,7 @@ class ZipNumIndexSource(BaseIndexSource):
for r in ranges:
yield decompress_block(r)
finally:
reader.close()
no_except_close(reader)
# iterate over all blocks
iter_ = itertools.chain.from_iterable(iter_blocks(reader))

View File

@ -1,20 +1,19 @@
import six
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.utils.io import no_except_close
from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
from pywb.utils.wbexception import NotFoundException
import six
#=================================================================
# =================================================================
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
self.no_record_parse = no_record_parse
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
@ -29,7 +28,7 @@ class ResolvingLoader(object):
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
headers_record.raw_stream.close()
no_except_close(headers_record.raw_stream)
# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit
@ -37,6 +36,10 @@ class ResolvingLoader(object):
headers_record = payload_record
if not headers_record or not payload_record:
if headers_record:
no_except_close(headers_record.raw_stream)
if payload_record:
no_except_close(payload_record.raw_stream)
raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here
@ -57,12 +60,13 @@ class ResolvingLoader(object):
from a different url to find the original record.
"""
has_curr = (cdx['filename'] != '-')
#has_orig = (cdx.get('orig.filename', '-') != '-')
# has_orig = (cdx.get('orig.filename', '-') != '-')
orig_f = cdx.get('orig.filename')
has_orig = orig_f and orig_f != '-'
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = None
payload_record = None
if has_curr:
headers_record = self._resolve_path_load(cdx, False, failed_files)
@ -85,7 +89,6 @@ class ResolvingLoader(object):
return headers_record, payload_record
def _resolve_path_load(self, cdx, is_original, failed_files):
"""
Load specific record based on filename, offset and length
@ -127,8 +130,8 @@ class ResolvingLoader(object):
any_found = True
try:
return (self.record_loader.
load(path, offset, length,
no_record_parse=self.no_record_parse))
load(path, offset, length,
no_record_parse=self.no_record_parse))
except Exception as ue:
last_exc = ue
@ -140,12 +143,12 @@ class ResolvingLoader(object):
failed_files.append(filename)
if last_exc:
#msg = str(last_exc.__class__.__name__)
# msg = str(last_exc.__class__.__name__)
msg = str(last_exc)
else:
msg = 'Archive File Not Found'
#raise ArchiveLoadFailed(msg, filename), None, last_traceback
# raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
def _load_different_url_payload(self, cdx, headers_record,

View File

@ -1,36 +1,31 @@
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.utils import to_native_str
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.memento import MementoUtils
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
from six.moves.urllib.parse import urlsplit, quote, unquote
import datetime
import json
import logging
import uuid
from io import BytesIO
import uuid
import six
import itertools
import json
import glob
import datetime
import logging
from requests.models import PreparedRequest
from six.moves.urllib.parse import quote, unquote, urlsplit
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from warcio.timeutils import (
datetime_to_http_date,
datetime_to_iso_date,
datetime_to_timestamp,
http_date_to_datetime,
iso_date_to_datetime,
timestamp_to_datetime
)
from warcio.utils import to_native_str
from pywb.utils.canonicalize import canonicalize
from pywb.utils.format import ParamFormatter
from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import LiveResourceException
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
logger = logging.getLogger('warcserver')
@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
http_headers.get_statuscode(),
http_headers.get_header('Location'))
except LiveResourceException:
headers.raw_stream.close()
payload.raw_stream.close()
no_except_close(headers.raw_stream)
no_except_close(payload.raw_stream)
raise
http_headers_buff = http_headers.to_bytes()
@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date'))
headers.raw_stream.close()
no_except_close(headers.raw_stream)
return (warc_headers, http_headers_buff, payload.raw_stream)
@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader):
p = PreparedRequest()
try:
p.prepare_url(load_url, None)
except:
except Exception:
raise LiveResourceException(load_url)
p.prepare_headers(None)
p.prepare_auth(None, load_url)
@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader):
elif cdx.get('memento_url'):
# if 'memento_url' set and no Memento-Datetime header present
# then its an error
no_except_close(upstream_res)
return None
agg_type = upstream_res.headers.get('Warcserver-Type')
@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader):
else:
conn = adapter.poolmanager
upstream_res = None
try:
upstream_res = conn.urlopen(method=method,
url=load_url,
@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader):
return upstream_res
except Exception as e:
if upstream_res:
no_except_close(upstream_res)
if logger.isEnabledFor(logging.DEBUG):
import traceback
traceback.print_exc()
@ -527,7 +525,7 @@ class VideoLoader(BaseLoader):
self.ydl = None
return
self.ydl = YoutubeDL(dict(simulate=True,
self.ydl = YoutubeDL(dict(simulate=True, quiet=True,
youtube_include_dash_manifest=False))
self.ydl.add_default_info_extractors()

View File

@ -9,7 +9,7 @@ brotlipy
pyyaml
werkzeug
webencodings
gevent
gevent==1.4.0
webassets==0.12.1
portalocker
wsgiprox>=1.5.1