mirror of https://github.com/webrecorder/pywb.git

Improved handling of open http connections and file handles (#463)

* improved pywb's closing of open file handles and HTTP connections by adding no_except_close to pywb.utils.io

replaced close() calls with no_except_close
reformatted and optimized the imports of the files that were modified

additional CI build fixes:
- pin gevent to 1.4.0 to ensure that builds of pywb on Ubuntu use gevent's wheel distribution
- youtube-dl fix: run youtube-dl in quiet mode to avoid errors from youtube-dl logging during pytest
John Berlin 2019-05-15 20:38:12 -04:00 committed by Ilya Kreymer
parent 22b4297fc5
commit a907b2b511
13 changed files with 242 additions and 255 deletions
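The heart of the change is the new no_except_close() helper added to pywb/utils/io.py (its full definition appears in the diff below). A minimal usage sketch of how call sites change, assuming a pywb version that includes this commit is installed:

    from io import BytesIO

    from pywb.utils.io import no_except_close  # helper added by this commit

    # before: call sites guarded each close() by hand
    stream = BytesIO(b'example payload')
    try:
        stream.close()
    except Exception:
        pass

    # after: one call that never raises; it also calls release_conn() when the
    # object exposes it (e.g. urllib3 responses), so pooled HTTP connections
    # are returned instead of leaking
    no_except_close(BytesIO(b'example payload'))

    # safe on None / already-closed objects as well
    no_except_close(None)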

View File

@@ -1,35 +1,23 @@
+from io import BytesIO
import requests
-from werkzeug.http import HTTP_STATUS_CODES
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
-from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
-from pywb.rewrite.wburl import WbUrl
-from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
-from pywb.utils.wbexception import WbException
-from pywb.utils.canonicalize import canonicalize
-from pywb.utils.loaders import extract_client_cookie
-from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
-from pywb.utils.memento import MementoUtils
-from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
+from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
+from werkzeug.http import HTTP_STATUS_CODES
-from pywb.warcserver.index.cdxobject import CDXObject
from pywb.apps.wbrequestresponse import WbResponse
+from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
-from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
+from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
+from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
+from pywb.rewrite.wburl import WbUrl
-from io import BytesIO
-from copy import copy
-import gevent
-import json
+from pywb.utils.canonicalize import canonicalize
+from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
+from pywb.utils.memento import MementoUtils
+from pywb.utils.wbexception import WbException
+from pywb.warcserver.index.cdxobject import CDXObject

# ============================================================================

@@ -40,7 +28,7 @@ class UpstreamException(WbException):
# ============================================================================
-#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
+# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
# pass

@@ -84,8 +72,8 @@ class RewriterApp(object):
self.banner_view)
self.frame_insert_view = TopFrameView(self.jinja_env,
self._html_templ('frame_insert_html'),
self.banner_view)
self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))

@@ -129,9 +117,9 @@
if accept_dt:
try:
wb_url.timestamp = http_date_to_timestamp(accept_dt)
-except:
+except Exception:
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
-#return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
+# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
wb_url.type = wb_url.REPLAY

@@ -163,7 +151,7 @@
range_start = start
range_end = end
-#if start with 0, load from upstream, but add range after
+# if start with 0, load from upstream, but add range after
if start == 0:
del inputreq.env['HTTP_RANGE']
else:

@@ -193,11 +181,6 @@
if range_start >= content_length or range_end >= content_length:
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
-try:
-r.raw.close()
-except:
-pass
raise UpstreamException(416, url=wb_url.url, details=details)

range_len = range_end - range_start + 1

@@ -296,9 +279,10 @@
error = None
try:
error = r.raw.read()
-r.raw.close()
-except:
+except Exception:
pass
+finally:
+no_except_close(r.raw)

if error:
error = error.decode('utf-8')

@@ -316,10 +300,7 @@
# add trailing slash
new_path = url_parts.path + '/'
-try:
-r.raw.close()
-except:
-pass
+no_except_close(r.raw)

return self.send_redirect(new_path, url_parts, urlrewriter)

@@ -330,9 +311,9 @@
memento_dt = r.headers.get('Memento-Datetime')
target_uri = r.headers.get('WARC-Target-URI')
-#cdx['urlkey'] = urlkey
-#cdx['timestamp'] = http_date_to_timestamp(memento_dt)
-#cdx['url'] = target_uri
+# cdx['urlkey'] = urlkey
+# cdx['timestamp'] = http_date_to_timestamp(memento_dt)
+# cdx['url'] = target_uri

set_content_loc = False

@@ -343,7 +324,7 @@
# if redir to exact, redir if url or ts are different
if self.redirect_to_exact:
if (set_content_loc or
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'],

@@ -375,15 +356,15 @@
else:
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)

head_insert_func = (self.head_insert_view.
create_insert_func(wb_url,
full_prefix,
host_prefix,
top_url,
environ,
framed_replay,
coll=kwargs.get('coll', ''),
replay_mod=self.replay_mod,
config=self.config))

cookie_rewriter = None
if self.cookie_tracker:

@@ -511,7 +492,6 @@
return WbResponse.text_response(resp, status=status, content_type='text/html')

def _do_req(self, inputreq, wb_url, kwargs, skip_record):
req_data = inputreq.reconstruct_request(wb_url.url)

@@ -618,7 +598,7 @@
return scheme + host

def get_rel_prefix(self, environ):
-#return request.script_name
+# return request.script_name
return environ.get('SCRIPT_NAME') + '/'

def get_full_prefix(self, environ):

View File

@@ -1,5 +1,7 @@
from warcio.statusandheaders import StatusAndHeaders
+from pywb.utils.io import no_except_close

try:
import ujson as json
except ImportError:  # pragma: no cover

@@ -151,8 +153,7 @@ class WbResponse(object):
self.status_headers.headers)
request_method = env['REQUEST_METHOD']
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
-if hasattr(self.body, 'close'):
-self.body.close()
+no_except_close(self.body)
return []

return self.body

View File

@@ -2,15 +2,14 @@ import base64
import datetime
import os
import shutil
import traceback
import portalocker
from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.utils.format import res_template
+from pywb.utils.io import no_except_close

@@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
try:
os.makedirs(path)
-except:
+except Exception:
pass
fh = open(filename, 'a+b')

@@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
try:
if os.name != 'nt':
portalocker.lock(fh, portalocker.LOCK_UN)
-fh.close()
return True
except Exception as e:
print(e)
return False
+finally:
+no_except_close(fh)

def get_dir_key(self, params):
return res_template(self.key_template, params)

@@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
for dir_key, out, filename in self.iter_open_files():
try:
mtime = os.path.getmtime(filename)
-except:
+except Exception:
self.close_key(dir_key)
return

View File

@@ -1,26 +1,21 @@
-from pywb.utils.io import StreamIter, BUFF_SIZE
-from pywb.utils.format import ParamFormatter, res_template
-from pywb.warcserver.inputrequest import DirectWSGIInputRequest
-from warcio.recordloader import ArcWarcRecordLoader
-from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
-from six.moves.urllib.parse import parse_qsl
-import six
import json
import tempfile
-import requests
import traceback
-import gevent.queue
import gevent
+import gevent.queue
+import requests
+import six
+from six.moves.urllib.parse import parse_qsl
+from warcio.recordloader import ArcWarcRecordLoader
+from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
+from pywb.utils.format import ParamFormatter
+from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
+from pywb.warcserver.inputrequest import DirectWSGIInputRequest

-#==============================================================================
+# ==============================================================================
class RecorderApp(object):
def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
self.upstream_host = upstream_host

@@ -52,13 +47,13 @@ class RecorderApp(object):
@staticmethod
def default_create_buffer(params, name):
-return tempfile.SpooledTemporaryFile(max_size=512*1024)
+return tempfile.SpooledTemporaryFile(max_size=512 * 1024)

def _write_loop(self):
while True:
try:
self._write_one()
-except:
+except Exception:
traceback.print_exc()

def _write_one(self):

@@ -88,14 +83,13 @@ class RecorderApp(object):
else:
self.writer.write_record(resp, params)
finally:
try:
if req_pay:
-req_pay.close()
+no_except_close(req_pay)
if resp_pay:
-resp_pay.close()
+no_except_close(resp_pay)
except Exception as e:
traceback.print_exc()

@@ -155,7 +149,7 @@ class RecorderApp(object):
finally:
if req_stream:
-req_stream.out.close()
+no_except_close(req_stream.out)
return self.send_message(msg,
'200 OK',

@@ -169,8 +163,7 @@ class RecorderApp(object):
def __call__(self, environ, start_response):
try:
return self.handle_call(environ, start_response)
-except:
-import traceback
+except Exception:
traceback.print_exc()

def handle_call(self, environ, start_response):

@@ -217,15 +210,15 @@ class RecorderApp(object):
try:
res = requests.request(url=self.upstream_host + request_uri,
method=method,
data=data,
headers=headers,
allow_redirects=False,
stream=True)
res.raise_for_status()
except Exception as e:
if req_is_wrapped:
-req_stream.out.close()
+no_except_close(req_stream.out)
return self.send_error(e, start_response)

if not skipping:

@@ -233,8 +226,7 @@ class RecorderApp(object):
req_stream.headers,
res.headers,
params)
for x in self.skip_filters)

if not skipping:
resp_stream = RespWrapper(res.raw,

@@ -248,7 +240,7 @@ class RecorderApp(object):
else:
resp_stream = res.raw
if req_is_wrapped:
-req_stream.out.close()
+no_except_close(req_stream.out)

resp_iter = StreamIter(resp_stream)

@@ -260,7 +252,7 @@ class RecorderApp(object):
return resp_iter

-#==============================================================================
+# ==============================================================================
class Wrapper(object):
def __init__(self, stream, params, create_func):
self.stream = stream

@@ -280,7 +272,7 @@ class Wrapper(object):
return buff

-#==============================================================================
+# ==============================================================================
class RespWrapper(Wrapper):
def __init__(self, stream, headers, req,
params, queue, path, create_func):

@@ -319,23 +311,20 @@ class RespWrapper(Wrapper):
entry = (self.req.headers, self.req.out,
self.headers, self.out, self.params)
self.queue.put(entry)
-except:
+except Exception:
traceback.print_exc()
skipping = True
finally:
-try:
-if skipping:
-self.out.close()
-self.req.out.close()
-except:
-traceback.print_exc()
+if skipping:
+no_except_close(self.out)
+no_except_close(self.req.out)

-self.req.close()
+no_except_close(self.req)
self.req = None

-#==============================================================================
+# ==============================================================================
class ReqWrapper(Wrapper):
def __init__(self, stream, req_headers, params, create_func):
super(ReqWrapper, self).__init__(stream, params, create_func)

@@ -348,5 +337,3 @@ class ReqWrapper(Wrapper):
def close(self):
# no need to close wsgi.input
pass

View File

@@ -1,19 +1,15 @@
-from io import BytesIO
+import codecs
+import json
+import re
+import tempfile
from contextlib import closing
+import webencodings
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
from warcio.utils import to_native_str
-import re
-import webencodings
-import tempfile
-import json
-import codecs
-from pywb.utils.io import StreamIter, BUFF_SIZE
-from pywb.utils.loaders import load_yaml_config, load_py_name
+from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
+from pywb.utils.loaders import load_py_name, load_yaml_config

WORKER_MODS = {"wkr_", "sw_"}  # type: Set[str]

@@ -344,7 +340,7 @@ class StreamingRewriter(object):
yield buff.encode(charset)
finally:
-stream.close()
+no_except_close(stream)

# ============================================================================

View File

@@ -1,6 +1,8 @@
-from gevent.pywsgi import WSGIServer, WSGIHandler
-from gevent import spawn
import logging
+import traceback
+from gevent import spawn
+from gevent.pywsgi import WSGIHandler, WSGIServer

# ============================================================================

View File

@@ -1,12 +1,35 @@
import zlib
from contextlib import closing, contextmanager
-from warcio.utils import BUFF_SIZE
-from warcio.limitreader import LimitReader
from tempfile import SpooledTemporaryFile
+from warcio.limitreader import LimitReader
+from warcio.utils import BUFF_SIZE

-#=============================================================================
+def no_except_close(closable):
+"""Attempts to call the close method of the
+supplied object.
+:param closable: The object to be closed
+:rtype: None
+"""
+if not closable:
+return
+try:
+closable.close()
+except Exception:
+pass
+try:
+release_conn = getattr(closable, 'release_conn', None)
+if release_conn is not None:
+release_conn()
+except Exception:
+pass

+# =============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
with closer(stream):
if header1:

@@ -22,19 +45,16 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
yield buff

-#=============================================================================
+# =============================================================================
@contextmanager
def call_release_conn(stream):
try:
yield stream
finally:
-if hasattr(stream, 'release_conn'):
-stream.release_conn()
-else:
-stream.close()
+no_except_close(stream)

-#=============================================================================
+# =============================================================================
def chunk_encode_iter(orig_iter):
for chunk in orig_iter:
if not len(chunk):

@@ -47,7 +67,7 @@ def chunk_encode_iter(orig_iter):
yield b'0\r\n\r\n'

-#=============================================================================
+# =============================================================================
def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
out = SpooledTemporaryFile(buff_size)
size = 0

@@ -65,7 +85,7 @@ def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
return StreamIter(out)

-#=============================================================================
+# =============================================================================
def compress_gzip_iter(orig_iter):
compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
for chunk in orig_iter:

@@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader):
def readline(self, length=None):
self._skip()
return super(OffsetLimitReader, self).readline(length)

View File

@@ -11,22 +11,22 @@ import requests
import yaml
import six
-from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
+from six.moves.urllib.parse import unquote_plus, urlsplit
import time
import pkgutil
-import base64
-import cgi
from io import open, BytesIO
from warcio.limitreader import LimitReader
+from pywb.utils.io import no_except_close

try:
import boto3
from botocore import UNSIGNED
from botocore.client import Config
s3_avail = True
-except ImportError: #pragma: no cover
+except ImportError:  # pragma: no cover
s3_avail = False

@@ -39,12 +39,12 @@ def load_py_name(string):
return getattr(mod, string[1])

-#=================================================================
+# =================================================================
def is_http(filename):
return filename.startswith(('http://', 'https://'))

-#=================================================================
+# =================================================================
def to_file_url(filename):
""" Convert a filename to a file:// url
"""

@@ -52,7 +52,7 @@
return url

-#=================================================================
+# =================================================================
def from_file_url(url):
""" Convert from file:// url to file path
"""

@@ -62,7 +62,7 @@
return url

-#=================================================================
+# =================================================================
def load(filename):
return BlockLoader().load(filename)

@@ -75,8 +75,7 @@
configdata = load(config_file)
config = yaml.load(configdata)
finally:
-if configdata:
-configdata.close()
+no_except_close(configdata)

return config

@@ -84,7 +83,6 @@
# =============================================================================
def load_overlay_config(main_env_var, main_default_file='',
overlay_env_var='', overlay_file=''):
configfile = os.environ.get(main_env_var, main_default_file)
config = None

@@ -104,7 +102,7 @@
return config

-#=================================================================
+# =================================================================
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')
if not cookie_header:

@@ -129,7 +127,7 @@
return value

-#=================================================================
+# =================================================================
def read_last_line(fh, offset=256):
""" Read last line from a seekable file. Start reading
from buff before end of file, and double backwards seek

@@ -150,7 +148,7 @@
return fh.readlines()[-1]

-#=================================================================
+# =================================================================
class BaseLoader(object):
def __init__(self, **kwargs):
pass

@@ -159,7 +157,7 @@
raise NotImplemented()

-#=================================================================
+# =================================================================
class BlockLoader(BaseLoader):
"""
a loader which can stream blocks of content

@@ -171,6 +169,7 @@ class BlockLoader(BaseLoader):
profile_loader = None

def __init__(self, **kwargs):
+super(BlockLoader, self).__init__()
self.cached = {}
self.kwargs = kwargs

@@ -241,7 +240,7 @@
return range_header

-#=================================================================
+# =================================================================
class PackageLoader(BaseLoader):
def load(self, url, offset=0, length=-1):
if url.startswith('pkg://'):

@@ -263,11 +262,11 @@ class PackageLoader(BaseLoader):
buff.name = url
return buff

-#afile = pkg_resources.resource_stream(pkg_split[0],
+# afile = pkg_resources.resource_stream(pkg_split[0],
# pkg_split[1])

-#=================================================================
+# =================================================================
class LocalFileLoader(PackageLoader):
def load(self, url, offset=0, length=-1):
"""

@@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader):
file_only = True
url = filename

+afile = None
try:
# first, try as file
afile = open(url, 'rb')
except IOError:
+no_except_close(afile)
if file_only:
raise

@@ -302,9 +303,10 @@
return afile

-#=================================================================
+# =================================================================
class HttpLoader(BaseLoader):
def __init__(self, **kwargs):
+super(HttpLoader, self).__init__()
self.cookie_maker = kwargs.get('cookie_maker')
if not self.cookie_maker:
self.cookie_maker = kwargs.get('cookie')

@@ -333,16 +335,17 @@
return r.raw

-#=================================================================
+# =================================================================
class S3Loader(BaseLoader):
def __init__(self, **kwargs):
+super(S3Loader, self).__init__()
self.client = None
self.aws_access_key_id = kwargs.get('aws_access_key_id')
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')

def load(self, url, offset, length):
-if not s3_avail: #pragma: no cover
+if not s3_avail:  # pragma: no cover
raise IOError('To load from s3 paths, ' +
'you must install boto3: pip install boto3')

aws_access_key_id = self.aws_access_key_id

@@ -372,8 +375,8 @@ class S3Loader(BaseLoader):
config = None

client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
config=config)
else:
client = self.client

@@ -398,15 +401,16 @@
return obj['Body']

-#=================================================================
+# =================================================================
# Signed Cookie-Maker
-#=================================================================
+# =================================================================
class HMACCookieMaker(object):
"""
Utility class to produce signed HMAC digest cookies
to be used with each http request
"""

def __init__(self, key, name, duration=10):
self.key = key
self.name = name

@@ -435,4 +439,3 @@
# ============================================================================
BlockLoader.init_default_loaders()

View File

@@ -1,22 +1,18 @@
-from pywb.utils.binsearch import iter_range
-from pywb.utils.canonicalize import canonicalize
-from pywb.utils.wbexception import NotFoundException
-from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
-from pywb.warcserver.http import DefaultAdapters
-from pywb.warcserver.index.cdxobject import CDXObject
-from pywb.utils.format import ParamFormatter, res_template
-from pywb.utils.memento import MementoUtils
+import logging
+import re
import redis
import requests
-import re
-import logging
+from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
+from pywb.utils.binsearch import iter_range
+from pywb.utils.canonicalize import canonicalize
+from pywb.utils.format import res_template
+from pywb.utils.io import no_except_close
+from pywb.utils.memento import MementoUtils
+from pywb.utils.wbexception import NotFoundException
+from pywb.warcserver.http import DefaultAdapters
+from pywb.warcserver.index.cdxobject import CDXObject

#=============================================================================

@@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource):
def handle_timemap(self, params):
url = res_template(self.timemap_url, params)
headers = self._get_headers(params)
+res = None
try:
res = self.sesh.get(url,
headers=headers,
timeout=params.get('_timeout'))
res.raise_for_status()
+assert(res.text)
except Exception as e:
+no_except_close(res)
self.logger.debug('FAILED: ' + str(e))
raise NotFoundException(url)

@@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource):
url = params['url']
load_url = self.timegate_url.format(url=url, timestamp=timestamp)

+res = None
try:
headers = self._get_headers(params)
res = self.sesh.head(load_url, headers=headers)
except Exception as e:
+no_except_close(res)
raise NotFoundException(url)

if res and res.headers.get('Memento-Datetime'):
if res.status_code >= 400:
+no_except_close(res)
raise NotFoundException(url)

if res.status_code >= 300:

View File

@@ -1,24 +1,20 @@
+import datetime
+import itertools
+import json
+import logging
+import os
from io import BytesIO
-import os
-import collections
-import itertools
-import logging
-import datetime
-import json

import six
-from six.moves import map
from warcio.bufferedreaders import gzip_decompressor

-#from pywb.warcserver.index.cdxsource import CDXSource
-from pywb.warcserver.index.indexsource import BaseIndexSource
-from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject
-from pywb.warcserver.index.query import CDXQuery
-from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.binsearch import iter_range, linearsearch, search
+from pywb.utils.io import no_except_close
+from pywb.utils.loaders import BlockLoader, read_last_line
+from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject
+# from pywb.warcserver.index.cdxsource import CDXSource
+from pywb.warcserver.index.indexsource import BaseIndexSource
+from pywb.warcserver.index.query import CDXQuery

# ============================================================================

@@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource):
if end_line == last_line and query.key >= last_line:
first_line = last_line
else:
-reader.close()
+no_except_close(reader)
if query.page_count:
yield self._page_info(0, pagesize, 0)
return

@@ -240,13 +236,13 @@
blocks = -1

yield self._page_info(total_pages, pagesize, blocks + 1)
-reader.close()
+no_except_close(reader)
return

curr_page = query.page
if curr_page >= total_pages or curr_page < 0:
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
-reader.close()
+no_except_close(reader)
raise CDXException(msg.format(curr_page, total_pages - 1))

startline = curr_page * pagesize

@@ -259,12 +255,14 @@
else:
startline -= 1

-idxiter = itertools.islice(first_iter, startline, endline)
-for idx in idxiter:
-yield idx
-reader.close()
+try:
+idxiter = itertools.islice(first_iter, startline, endline)
+for idx in idxiter:
+yield idx
+except Exception:
+pass
+finally:
+no_except_close(reader)

def search_by_line_num(self, reader, line):  # pragma: no cover
def line_cmp(line1, line2):

@@ -349,7 +347,7 @@
for r in ranges:
yield decompress_block(r)
finally:
-reader.close()
+no_except_close(reader)

# iterate over all blocks
iter_ = itertools.chain.from_iterable(iter_blocks(reader))

View File

@@ -1,20 +1,19 @@
+import six
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
+from pywb.utils.io import no_except_close
+from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
-from pywb.utils.wbexception import NotFoundException
-import six

-#=================================================================
+# =================================================================
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'

-def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
+def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
self.path_resolvers = path_resolvers
-self.record_loader = record_loader
+self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
self.no_record_parse = no_record_parse

def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):

@@ -29,7 +28,7 @@ class ResolvingLoader(object):
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
-headers_record.raw_stream.close()
+no_except_close(headers_record.raw_stream)

# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit

@@ -37,6 +36,10 @@ class ResolvingLoader(object):
headers_record = payload_record

if not headers_record or not payload_record:
+if headers_record:
+no_except_close(headers_record.raw_stream)
+if payload_record:
+no_except_close(payload_record.raw_stream)
raise ArchiveLoadFailed('Could not load ' + str(cdx))

# ensure status line is valid from here

@@ -57,12 +60,13 @@ class ResolvingLoader(object):
from a different url to find the original record.
"""
has_curr = (cdx['filename'] != '-')
-#has_orig = (cdx.get('orig.filename', '-') != '-')
+# has_orig = (cdx.get('orig.filename', '-') != '-')
orig_f = cdx.get('orig.filename')
has_orig = orig_f and orig_f != '-'

# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = None
+payload_record = None
if has_curr:
headers_record = self._resolve_path_load(cdx, False, failed_files)

@@ -85,7 +89,6 @@ class ResolvingLoader(object):
return headers_record, payload_record

def _resolve_path_load(self, cdx, is_original, failed_files):
"""
Load specific record based on filename, offset and length

@@ -127,8 +130,8 @@ class ResolvingLoader(object):
any_found = True
try:
return (self.record_loader.
load(path, offset, length,
no_record_parse=self.no_record_parse))
except Exception as ue:
last_exc = ue

@@ -140,12 +143,12 @@ class ResolvingLoader(object):
failed_files.append(filename)

if last_exc:
-#msg = str(last_exc.__class__.__name__)
+# msg = str(last_exc.__class__.__name__)
msg = str(last_exc)
else:
msg = 'Archive File Not Found'

-#raise ArchiveLoadFailed(msg, filename), None, last_traceback
+# raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)

def _load_different_url_payload(self, cdx, headers_record,

View File

@@ -1,36 +1,31 @@
-from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
-from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
-from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
-from warcio.utils import to_native_str
-from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
-from pywb.utils.wbexception import LiveResourceException, WbException
-from pywb.utils.canonicalize import canonicalize
-from pywb.utils.memento import MementoUtils
-from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
-from pywb.utils.format import ParamFormatter
-from pywb.warcserver.resource.resolvingloader import ResolvingLoader
-from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
-from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
-from six.moves.urllib.parse import urlsplit, quote, unquote
+import datetime
+import json
+import logging
+import uuid
from io import BytesIO
-import uuid
import six
-import itertools
-import json
-import glob
-import datetime
-import logging
from requests.models import PreparedRequest
+from six.moves.urllib.parse import quote, unquote, urlsplit
+from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
+from warcio.timeutils import (
+    datetime_to_http_date,
+    datetime_to_iso_date,
+    datetime_to_timestamp,
+    http_date_to_datetime,
+    iso_date_to_datetime,
+    timestamp_to_datetime
+)
+from warcio.utils import to_native_str
+from pywb.utils.canonicalize import canonicalize
+from pywb.utils.format import ParamFormatter
+from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
+from pywb.utils.memento import MementoUtils
+from pywb.utils.wbexception import LiveResourceException
+from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
+from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
+from pywb.warcserver.resource.resolvingloader import ResolvingLoader

logger = logging.getLogger('warcserver')

@@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
http_headers.get_statuscode(),
http_headers.get_header('Location'))
except LiveResourceException:
-headers.raw_stream.close()
-payload.raw_stream.close()
+no_except_close(headers.raw_stream)
+no_except_close(payload.raw_stream)
raise

http_headers_buff = http_headers.to_bytes()

@@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date'))

-headers.raw_stream.close()
+no_except_close(headers.raw_stream)

return (warc_headers, http_headers_buff, payload.raw_stream)

@@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader):
p = PreparedRequest()
try:
p.prepare_url(load_url, None)
-except:
+except Exception:
raise LiveResourceException(load_url)
p.prepare_headers(None)
p.prepare_auth(None, load_url)

@@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader):
elif cdx.get('memento_url'):
# if 'memento_url' set and no Memento-Datetime header present
# then its an error
+no_except_close(upstream_res)
return None

agg_type = upstream_res.headers.get('Warcserver-Type')

@@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader):
else:
conn = adapter.poolmanager

+upstream_res = None
try:
upstream_res = conn.urlopen(method=method,
url=load_url,

@@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader):
return upstream_res

except Exception as e:
+if upstream_res:
+no_except_close(upstream_res)
if logger.isEnabledFor(logging.DEBUG):
import traceback
traceback.print_exc()

@@ -527,7 +525,7 @@ class VideoLoader(BaseLoader):
self.ydl = None
return

-self.ydl = YoutubeDL(dict(simulate=True,
+self.ydl = YoutubeDL(dict(simulate=True, quiet=True,
youtube_include_dash_manifest=False))

self.ydl.add_default_info_extractors()
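
The VideoLoader change in the last hunk above is the youtube-dl part of the CI fix: the extractor is now built in quiet, simulate-only mode so its console logging does not break pytest output capture. A standalone sketch, assuming the youtube_dl package is installed:

    from youtube_dl import YoutubeDL

    # quiet=True suppresses youtube-dl's own logging; simulate=True means no
    # media is downloaded when info is later extracted
    ydl = YoutubeDL(dict(simulate=True, quiet=True,
                         youtube_include_dash_manifest=False))
    ydl.add_default_info_extractors()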

View File

@@ -9,7 +9,7 @@ brotlipy
pyyaml
werkzeug
webencodings
-gevent
+gevent==1.4.0
webassets==0.12.1
portalocker
wsgiprox>=1.5.1