mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Improved handling of open http connections and file handles (#463)
* improved pywb's closing of open file handles and http connects by adding to pywb.util.io no_except_close replaced close calls with no_except_close reformatted and optimizes import of files that were modified additional ci build fixes: - pin gevent to 1.4.0 in order to ensure build of pywb on ubuntu use gevent's wheel distribution - youtube-dl fix: use youtube-dl in quiet mode to avoid errors with youtube-dl logging in pytest
This commit is contained in:
parent
22b4297fc5
commit
a907b2b511
@ -1,35 +1,23 @@
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
|
||||
from pywb.utils.memento import MementoUtils
|
||||
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
from copy import copy
|
||||
|
||||
import gevent
|
||||
import json
|
||||
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
||||
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -129,7 +117,7 @@ class RewriterApp(object):
|
||||
if accept_dt:
|
||||
try:
|
||||
wb_url.timestamp = http_date_to_timestamp(accept_dt)
|
||||
except:
|
||||
except Exception:
|
||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
|
||||
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||
|
||||
@ -193,11 +181,6 @@ class RewriterApp(object):
|
||||
|
||||
if range_start >= content_length or range_end >= content_length:
|
||||
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
raise UpstreamException(416, url=wb_url.url, details=details)
|
||||
|
||||
range_len = range_end - range_start + 1
|
||||
@ -296,9 +279,10 @@ class RewriterApp(object):
|
||||
error = None
|
||||
try:
|
||||
error = r.raw.read()
|
||||
r.raw.close()
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
no_except_close(r.raw)
|
||||
|
||||
if error:
|
||||
error = error.decode('utf-8')
|
||||
@ -316,10 +300,7 @@ class RewriterApp(object):
|
||||
# add trailing slash
|
||||
new_path = url_parts.path + '/'
|
||||
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
no_except_close(r.raw)
|
||||
|
||||
return self.send_redirect(new_path, url_parts, urlrewriter)
|
||||
|
||||
@ -511,7 +492,6 @@ class RewriterApp(object):
|
||||
|
||||
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
||||
|
||||
|
||||
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
||||
req_data = inputreq.reconstruct_request(wb_url.url)
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError: # pragma: no cover
|
||||
@ -151,8 +153,7 @@ class WbResponse(object):
|
||||
self.status_headers.headers)
|
||||
request_method = env['REQUEST_METHOD']
|
||||
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
|
||||
if hasattr(self.body, 'close'):
|
||||
self.body.close()
|
||||
no_except_close(self.body)
|
||||
return []
|
||||
|
||||
return self.body
|
||||
|
@ -2,15 +2,14 @@ import base64
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import traceback
|
||||
|
||||
import portalocker
|
||||
|
||||
from warcio.timeutils import timestamp20_now
|
||||
from warcio.warcwriter import BaseWARCWriter
|
||||
|
||||
from pywb.utils.format import res_template
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
fh = open(filename, 'a+b')
|
||||
@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
try:
|
||||
if os.name != 'nt':
|
||||
portalocker.lock(fh, portalocker.LOCK_UN)
|
||||
fh.close()
|
||||
return True
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return False
|
||||
finally:
|
||||
no_except_close(fh)
|
||||
|
||||
def get_dir_key(self, params):
|
||||
return res_template(self.key_template, params)
|
||||
@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
for dir_key, out, filename in self.iter_open_files():
|
||||
try:
|
||||
mtime = os.path.getmtime(filename)
|
||||
except:
|
||||
except Exception:
|
||||
self.close_key(dir_key)
|
||||
return
|
||||
|
||||
|
@ -1,23 +1,18 @@
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
|
||||
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
import six
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
|
||||
import traceback
|
||||
|
||||
import gevent.queue
|
||||
import gevent
|
||||
import gevent.queue
|
||||
import requests
|
||||
import six
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
|
||||
from pywb.utils.format import ParamFormatter
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
@ -58,7 +53,7 @@ class RecorderApp(object):
|
||||
while True:
|
||||
try:
|
||||
self._write_one()
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def _write_one(self):
|
||||
@ -88,14 +83,13 @@ class RecorderApp(object):
|
||||
else:
|
||||
self.writer.write_record(resp, params)
|
||||
|
||||
|
||||
finally:
|
||||
try:
|
||||
if req_pay:
|
||||
req_pay.close()
|
||||
no_except_close(req_pay)
|
||||
|
||||
if resp_pay:
|
||||
resp_pay.close()
|
||||
no_except_close(resp_pay)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
@ -155,7 +149,7 @@ class RecorderApp(object):
|
||||
|
||||
finally:
|
||||
if req_stream:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
return self.send_message(msg,
|
||||
'200 OK',
|
||||
@ -169,8 +163,7 @@ class RecorderApp(object):
|
||||
def __call__(self, environ, start_response):
|
||||
try:
|
||||
return self.handle_call(environ, start_response)
|
||||
except:
|
||||
import traceback
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def handle_call(self, environ, start_response):
|
||||
@ -225,7 +218,7 @@ class RecorderApp(object):
|
||||
res.raise_for_status()
|
||||
except Exception as e:
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
return self.send_error(e, start_response)
|
||||
|
||||
if not skipping:
|
||||
@ -235,7 +228,6 @@ class RecorderApp(object):
|
||||
params)
|
||||
for x in self.skip_filters)
|
||||
|
||||
|
||||
if not skipping:
|
||||
resp_stream = RespWrapper(res.raw,
|
||||
res.headers,
|
||||
@ -248,7 +240,7 @@ class RecorderApp(object):
|
||||
else:
|
||||
resp_stream = res.raw
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
resp_iter = StreamIter(resp_stream)
|
||||
|
||||
@ -319,19 +311,16 @@ class RespWrapper(Wrapper):
|
||||
entry = (self.req.headers, self.req.out,
|
||||
self.headers, self.out, self.params)
|
||||
self.queue.put(entry)
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
skipping = True
|
||||
|
||||
finally:
|
||||
try:
|
||||
if skipping:
|
||||
self.out.close()
|
||||
self.req.out.close()
|
||||
except:
|
||||
traceback.print_exc()
|
||||
no_except_close(self.out)
|
||||
no_except_close(self.req.out)
|
||||
|
||||
self.req.close()
|
||||
no_except_close(self.req)
|
||||
self.req = None
|
||||
|
||||
|
||||
@ -348,5 +337,3 @@ class ReqWrapper(Wrapper):
|
||||
def close(self):
|
||||
# no need to close wsgi.input
|
||||
pass
|
||||
|
||||
|
||||
|
@ -1,19 +1,15 @@
|
||||
from io import BytesIO
|
||||
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
from contextlib import closing
|
||||
|
||||
import webencodings
|
||||
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
import json
|
||||
import codecs
|
||||
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config, load_py_name
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.utils.loaders import load_py_name, load_yaml_config
|
||||
|
||||
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
|
||||
|
||||
@ -344,7 +340,7 @@ class StreamingRewriter(object):
|
||||
yield buff.encode(charset)
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
no_except_close(stream)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -1,6 +1,8 @@
|
||||
from gevent.pywsgi import WSGIServer, WSGIHandler
|
||||
from gevent import spawn
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from gevent import spawn
|
||||
from gevent.pywsgi import WSGIHandler, WSGIServer
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -1,10 +1,33 @@
|
||||
import zlib
|
||||
from contextlib import closing, contextmanager
|
||||
|
||||
from warcio.utils import BUFF_SIZE
|
||||
from warcio.limitreader import LimitReader
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from warcio.limitreader import LimitReader
|
||||
from warcio.utils import BUFF_SIZE
|
||||
|
||||
|
||||
def no_except_close(closable):
|
||||
"""Attempts to call the close method of the
|
||||
supplied object.
|
||||
|
||||
:param closable: The object to be closed
|
||||
:rtype: None
|
||||
"""
|
||||
if not closable:
|
||||
return
|
||||
|
||||
try:
|
||||
closable.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
release_conn = getattr(closable, 'release_conn', None)
|
||||
if release_conn is not None:
|
||||
release_conn()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# =============================================================================
|
||||
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
|
||||
@ -28,10 +51,7 @@ def call_release_conn(stream):
|
||||
try:
|
||||
yield stream
|
||||
finally:
|
||||
if hasattr(stream, 'release_conn'):
|
||||
stream.release_conn()
|
||||
else:
|
||||
stream.close()
|
||||
no_except_close(stream)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader):
|
||||
def readline(self, length=None):
|
||||
self._skip()
|
||||
return super(OffsetLimitReader, self).readline(length)
|
||||
|
||||
|
@ -11,20 +11,20 @@ import requests
|
||||
import yaml
|
||||
|
||||
import six
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
||||
from six.moves.urllib.parse import unquote_plus, urlsplit
|
||||
|
||||
import time
|
||||
import pkgutil
|
||||
import base64
|
||||
import cgi
|
||||
|
||||
from io import open, BytesIO
|
||||
from warcio.limitreader import LimitReader
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
try:
|
||||
import boto3
|
||||
from botocore import UNSIGNED
|
||||
from botocore.client import Config
|
||||
|
||||
s3_avail = True
|
||||
except ImportError: # pragma: no cover
|
||||
s3_avail = False
|
||||
@ -75,8 +75,7 @@ def load_yaml_config(config_file):
|
||||
configdata = load(config_file)
|
||||
config = yaml.load(configdata)
|
||||
finally:
|
||||
if configdata:
|
||||
configdata.close()
|
||||
no_except_close(configdata)
|
||||
|
||||
return config
|
||||
|
||||
@ -84,7 +83,6 @@ def load_yaml_config(config_file):
|
||||
# =============================================================================
|
||||
def load_overlay_config(main_env_var, main_default_file='',
|
||||
overlay_env_var='', overlay_file=''):
|
||||
|
||||
configfile = os.environ.get(main_env_var, main_default_file)
|
||||
config = None
|
||||
|
||||
@ -171,6 +169,7 @@ class BlockLoader(BaseLoader):
|
||||
profile_loader = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(BlockLoader, self).__init__()
|
||||
self.cached = {}
|
||||
self.kwargs = kwargs
|
||||
|
||||
@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader):
|
||||
file_only = True
|
||||
url = filename
|
||||
|
||||
afile = None
|
||||
try:
|
||||
# first, try as file
|
||||
afile = open(url, 'rb')
|
||||
|
||||
except IOError:
|
||||
no_except_close(afile)
|
||||
if file_only:
|
||||
raise
|
||||
|
||||
@ -305,6 +306,7 @@ class LocalFileLoader(PackageLoader):
|
||||
# =================================================================
|
||||
class HttpLoader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
super(HttpLoader, self).__init__()
|
||||
self.cookie_maker = kwargs.get('cookie_maker')
|
||||
if not self.cookie_maker:
|
||||
self.cookie_maker = kwargs.get('cookie')
|
||||
@ -336,6 +338,7 @@ class HttpLoader(BaseLoader):
|
||||
# =================================================================
|
||||
class S3Loader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
super(S3Loader, self).__init__()
|
||||
self.client = None
|
||||
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
||||
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
||||
@ -407,6 +410,7 @@ class HMACCookieMaker(object):
|
||||
Utility class to produce signed HMAC digest cookies
|
||||
to be used with each http request
|
||||
"""
|
||||
|
||||
def __init__(self, key, name, duration=10):
|
||||
self.key = key
|
||||
self.name = name
|
||||
@ -435,4 +439,3 @@ class HMACCookieMaker(object):
|
||||
|
||||
# ============================================================================
|
||||
BlockLoader.init_default_loaders()
|
||||
|
||||
|
@ -1,22 +1,18 @@
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
||||
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.utils.memento import MementoUtils
|
||||
import logging
|
||||
import re
|
||||
|
||||
import redis
|
||||
|
||||
import requests
|
||||
from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
|
||||
|
||||
import re
|
||||
import logging
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.format import res_template
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource):
|
||||
def handle_timemap(self, params):
|
||||
url = res_template(self.timemap_url, params)
|
||||
headers = self._get_headers(params)
|
||||
res = None
|
||||
try:
|
||||
res = self.sesh.get(url,
|
||||
headers=headers,
|
||||
timeout=params.get('_timeout'))
|
||||
|
||||
res.raise_for_status()
|
||||
assert(res.text)
|
||||
|
||||
except Exception as e:
|
||||
no_except_close(res)
|
||||
self.logger.debug('FAILED: ' + str(e))
|
||||
raise NotFoundException(url)
|
||||
|
||||
@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
url = params['url']
|
||||
load_url = self.timegate_url.format(url=url, timestamp=timestamp)
|
||||
|
||||
res = None
|
||||
try:
|
||||
headers = self._get_headers(params)
|
||||
res = self.sesh.head(load_url, headers=headers)
|
||||
except Exception as e:
|
||||
no_except_close(res)
|
||||
raise NotFoundException(url)
|
||||
|
||||
if res and res.headers.get('Memento-Datetime'):
|
||||
if res.status_code >= 400:
|
||||
no_except_close(res)
|
||||
raise NotFoundException(url)
|
||||
|
||||
if res.status_code >= 300:
|
||||
|
@ -1,25 +1,21 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import os
|
||||
import collections
|
||||
import itertools
|
||||
import logging
|
||||
import datetime
|
||||
import json
|
||||
import six
|
||||
|
||||
from six.moves import map
|
||||
|
||||
from warcio.bufferedreaders import gzip_decompressor
|
||||
|
||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||
from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject
|
||||
# from pywb.warcserver.index.cdxsource import CDXSource
|
||||
from pywb.warcserver.index.indexsource import BaseIndexSource
|
||||
from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject
|
||||
from pywb.warcserver.index.query import CDXQuery
|
||||
|
||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ZipBlocks(object):
|
||||
@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
if end_line == last_line and query.key >= last_line:
|
||||
first_line = last_line
|
||||
else:
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
if query.page_count:
|
||||
yield self._page_info(0, pagesize, 0)
|
||||
return
|
||||
@ -240,13 +236,13 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
blocks = -1
|
||||
|
||||
yield self._page_info(total_pages, pagesize, blocks + 1)
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
return
|
||||
|
||||
curr_page = query.page
|
||||
if curr_page >= total_pages or curr_page < 0:
|
||||
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
raise CDXException(msg.format(curr_page, total_pages - 1))
|
||||
|
||||
startline = curr_page * pagesize
|
||||
@ -259,12 +255,14 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
else:
|
||||
startline -= 1
|
||||
|
||||
try:
|
||||
idxiter = itertools.islice(first_iter, startline, endline)
|
||||
for idx in idxiter:
|
||||
yield idx
|
||||
|
||||
reader.close()
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
no_except_close(reader)
|
||||
|
||||
def search_by_line_num(self, reader, line): # pragma: no cover
|
||||
def line_cmp(line1, line2):
|
||||
@ -349,7 +347,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
for r in ranges:
|
||||
yield decompress_block(r)
|
||||
finally:
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
|
||||
# iterate over all blocks
|
||||
iter_ = itertools.chain.from_iterable(iter_blocks(reader))
|
||||
|
@ -1,20 +1,19 @@
|
||||
import six
|
||||
from warcio.recordloader import ArchiveLoadFailed
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
|
||||
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
|
||||
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
import six
|
||||
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
|
||||
|
||||
|
||||
# =================================================================
|
||||
class ResolvingLoader(object):
|
||||
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
||||
|
||||
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
|
||||
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
|
||||
self.path_resolvers = path_resolvers
|
||||
self.record_loader = record_loader
|
||||
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
|
||||
self.no_record_parse = no_record_parse
|
||||
|
||||
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
|
||||
@ -29,7 +28,7 @@ class ResolvingLoader(object):
|
||||
elif headers_record != payload_record:
|
||||
# close remainder of stream as this record only used for
|
||||
# (already parsed) headers
|
||||
headers_record.raw_stream.close()
|
||||
no_except_close(headers_record.raw_stream)
|
||||
|
||||
# special case: check if headers record is actually empty
|
||||
# (eg empty revisit), then use headers from revisit
|
||||
@ -37,6 +36,10 @@ class ResolvingLoader(object):
|
||||
headers_record = payload_record
|
||||
|
||||
if not headers_record or not payload_record:
|
||||
if headers_record:
|
||||
no_except_close(headers_record.raw_stream)
|
||||
if payload_record:
|
||||
no_except_close(payload_record.raw_stream)
|
||||
raise ArchiveLoadFailed('Could not load ' + str(cdx))
|
||||
|
||||
# ensure status line is valid from here
|
||||
@ -63,6 +66,7 @@ class ResolvingLoader(object):
|
||||
|
||||
# load headers record from cdx['filename'] unless it is '-' (rare)
|
||||
headers_record = None
|
||||
payload_record = None
|
||||
if has_curr:
|
||||
headers_record = self._resolve_path_load(cdx, False, failed_files)
|
||||
|
||||
@ -85,7 +89,6 @@ class ResolvingLoader(object):
|
||||
|
||||
return headers_record, payload_record
|
||||
|
||||
|
||||
def _resolve_path_load(self, cdx, is_original, failed_files):
|
||||
"""
|
||||
Load specific record based on filename, offset and length
|
||||
|
@ -1,36 +1,31 @@
|
||||
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
||||
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
||||
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
||||
|
||||
from pywb.utils.wbexception import LiveResourceException, WbException
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
|
||||
from pywb.utils.format import ParamFormatter
|
||||
|
||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||
|
||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from io import BytesIO
|
||||
|
||||
import uuid
|
||||
import six
|
||||
import itertools
|
||||
import json
|
||||
import glob
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
from requests.models import PreparedRequest
|
||||
from six.moves.urllib.parse import quote, unquote, urlsplit
|
||||
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
||||
from warcio.timeutils import (
|
||||
datetime_to_http_date,
|
||||
datetime_to_iso_date,
|
||||
datetime_to_timestamp,
|
||||
http_date_to_datetime,
|
||||
iso_date_to_datetime,
|
||||
timestamp_to_datetime
|
||||
)
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.format import ParamFormatter
|
||||
from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||
|
||||
logger = logging.getLogger('warcserver')
|
||||
|
||||
@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
http_headers.get_statuscode(),
|
||||
http_headers.get_header('Location'))
|
||||
except LiveResourceException:
|
||||
headers.raw_stream.close()
|
||||
payload.raw_stream.close()
|
||||
no_except_close(headers.raw_stream)
|
||||
no_except_close(payload.raw_stream)
|
||||
raise
|
||||
|
||||
http_headers_buff = http_headers.to_bytes()
|
||||
@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
|
||||
warc_headers.replace_header('WARC-Date',
|
||||
headers.rec_headers.get_header('WARC-Date'))
|
||||
|
||||
headers.raw_stream.close()
|
||||
no_except_close(headers.raw_stream)
|
||||
|
||||
return (warc_headers, http_headers_buff, payload.raw_stream)
|
||||
|
||||
@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader):
|
||||
p = PreparedRequest()
|
||||
try:
|
||||
p.prepare_url(load_url, None)
|
||||
except:
|
||||
except Exception:
|
||||
raise LiveResourceException(load_url)
|
||||
p.prepare_headers(None)
|
||||
p.prepare_auth(None, load_url)
|
||||
@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader):
|
||||
elif cdx.get('memento_url'):
|
||||
# if 'memento_url' set and no Memento-Datetime header present
|
||||
# then its an error
|
||||
no_except_close(upstream_res)
|
||||
return None
|
||||
|
||||
agg_type = upstream_res.headers.get('Warcserver-Type')
|
||||
@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader):
|
||||
else:
|
||||
conn = adapter.poolmanager
|
||||
|
||||
upstream_res = None
|
||||
try:
|
||||
upstream_res = conn.urlopen(method=method,
|
||||
url=load_url,
|
||||
@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader):
|
||||
return upstream_res
|
||||
|
||||
except Exception as e:
|
||||
if upstream_res:
|
||||
no_except_close(upstream_res)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@ -527,7 +525,7 @@ class VideoLoader(BaseLoader):
|
||||
self.ydl = None
|
||||
return
|
||||
|
||||
self.ydl = YoutubeDL(dict(simulate=True,
|
||||
self.ydl = YoutubeDL(dict(simulate=True, quiet=True,
|
||||
youtube_include_dash_manifest=False))
|
||||
|
||||
self.ydl.add_default_info_extractors()
|
||||
|
@ -9,7 +9,7 @@ brotlipy
|
||||
pyyaml
|
||||
werkzeug
|
||||
webencodings
|
||||
gevent
|
||||
gevent==1.4.0
|
||||
webassets==0.12.1
|
||||
portalocker
|
||||
wsgiprox>=1.5.1
|
||||
|
Loading…
x
Reference in New Issue
Block a user