mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Improved handling of open http connections and file handles (#463)
* improved pywb's closing of open file handles and http connects by adding to pywb.util.io no_except_close replaced close calls with no_except_close reformatted and optimizes import of files that were modified additional ci build fixes: - pin gevent to 1.4.0 in order to ensure build of pywb on ubuntu use gevent's wheel distribution - youtube-dl fix: use youtube-dl in quiet mode to avoid errors with youtube-dl logging in pytest
This commit is contained in:
parent
22b4297fc5
commit
a907b2b511
@ -1,35 +1,23 @@
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
|
||||
from pywb.utils.memento import MementoUtils
|
||||
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
from copy import copy
|
||||
|
||||
import gevent
|
||||
import json
|
||||
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
||||
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -40,7 +28,7 @@ class UpstreamException(WbException):
|
||||
|
||||
|
||||
# ============================================================================
|
||||
#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
||||
# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
|
||||
# pass
|
||||
|
||||
|
||||
@ -84,8 +72,8 @@ class RewriterApp(object):
|
||||
self.banner_view)
|
||||
|
||||
self.frame_insert_view = TopFrameView(self.jinja_env,
|
||||
self._html_templ('frame_insert_html'),
|
||||
self.banner_view)
|
||||
self._html_templ('frame_insert_html'),
|
||||
self.banner_view)
|
||||
|
||||
self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
|
||||
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
|
||||
@ -129,9 +117,9 @@ class RewriterApp(object):
|
||||
if accept_dt:
|
||||
try:
|
||||
wb_url.timestamp = http_date_to_timestamp(accept_dt)
|
||||
except:
|
||||
except Exception:
|
||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
|
||||
#return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||
|
||||
wb_url.type = wb_url.REPLAY
|
||||
|
||||
@ -163,7 +151,7 @@ class RewriterApp(object):
|
||||
range_start = start
|
||||
range_end = end
|
||||
|
||||
#if start with 0, load from upstream, but add range after
|
||||
# if start with 0, load from upstream, but add range after
|
||||
if start == 0:
|
||||
del inputreq.env['HTTP_RANGE']
|
||||
else:
|
||||
@ -193,11 +181,6 @@ class RewriterApp(object):
|
||||
|
||||
if range_start >= content_length or range_end >= content_length:
|
||||
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
raise UpstreamException(416, url=wb_url.url, details=details)
|
||||
|
||||
range_len = range_end - range_start + 1
|
||||
@ -296,9 +279,10 @@ class RewriterApp(object):
|
||||
error = None
|
||||
try:
|
||||
error = r.raw.read()
|
||||
r.raw.close()
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
no_except_close(r.raw)
|
||||
|
||||
if error:
|
||||
error = error.decode('utf-8')
|
||||
@ -316,10 +300,7 @@ class RewriterApp(object):
|
||||
# add trailing slash
|
||||
new_path = url_parts.path + '/'
|
||||
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
no_except_close(r.raw)
|
||||
|
||||
return self.send_redirect(new_path, url_parts, urlrewriter)
|
||||
|
||||
@ -330,9 +311,9 @@ class RewriterApp(object):
|
||||
memento_dt = r.headers.get('Memento-Datetime')
|
||||
target_uri = r.headers.get('WARC-Target-URI')
|
||||
|
||||
#cdx['urlkey'] = urlkey
|
||||
#cdx['timestamp'] = http_date_to_timestamp(memento_dt)
|
||||
#cdx['url'] = target_uri
|
||||
# cdx['urlkey'] = urlkey
|
||||
# cdx['timestamp'] = http_date_to_timestamp(memento_dt)
|
||||
# cdx['url'] = target_uri
|
||||
|
||||
set_content_loc = False
|
||||
|
||||
@ -343,7 +324,7 @@ class RewriterApp(object):
|
||||
# if redir to exact, redir if url or ts are different
|
||||
if self.redirect_to_exact:
|
||||
if (set_content_loc or
|
||||
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
|
||||
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
|
||||
|
||||
new_url = urlrewriter.get_new_url(url=target_uri,
|
||||
timestamp=cdx['timestamp'],
|
||||
@ -375,15 +356,15 @@ class RewriterApp(object):
|
||||
else:
|
||||
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
|
||||
head_insert_func = (self.head_insert_view.
|
||||
create_insert_func(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
top_url,
|
||||
environ,
|
||||
framed_replay,
|
||||
coll=kwargs.get('coll', ''),
|
||||
replay_mod=self.replay_mod,
|
||||
config=self.config))
|
||||
create_insert_func(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
top_url,
|
||||
environ,
|
||||
framed_replay,
|
||||
coll=kwargs.get('coll', ''),
|
||||
replay_mod=self.replay_mod,
|
||||
config=self.config))
|
||||
|
||||
cookie_rewriter = None
|
||||
if self.cookie_tracker:
|
||||
@ -511,7 +492,6 @@ class RewriterApp(object):
|
||||
|
||||
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
||||
|
||||
|
||||
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
||||
req_data = inputreq.reconstruct_request(wb_url.url)
|
||||
|
||||
@ -618,7 +598,7 @@ class RewriterApp(object):
|
||||
return scheme + host
|
||||
|
||||
def get_rel_prefix(self, environ):
|
||||
#return request.script_name
|
||||
# return request.script_name
|
||||
return environ.get('SCRIPT_NAME') + '/'
|
||||
|
||||
def get_full_prefix(self, environ):
|
||||
|
@ -1,5 +1,7 @@
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError: # pragma: no cover
|
||||
@ -151,8 +153,7 @@ class WbResponse(object):
|
||||
self.status_headers.headers)
|
||||
request_method = env['REQUEST_METHOD']
|
||||
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
|
||||
if hasattr(self.body, 'close'):
|
||||
self.body.close()
|
||||
no_except_close(self.body)
|
||||
return []
|
||||
|
||||
return self.body
|
||||
|
@ -2,15 +2,14 @@ import base64
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import traceback
|
||||
|
||||
import portalocker
|
||||
|
||||
from warcio.timeutils import timestamp20_now
|
||||
from warcio.warcwriter import BaseWARCWriter
|
||||
|
||||
from pywb.utils.format import res_template
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
fh = open(filename, 'a+b')
|
||||
@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
try:
|
||||
if os.name != 'nt':
|
||||
portalocker.lock(fh, portalocker.LOCK_UN)
|
||||
fh.close()
|
||||
return True
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return False
|
||||
finally:
|
||||
no_except_close(fh)
|
||||
|
||||
def get_dir_key(self, params):
|
||||
return res_template(self.key_template, params)
|
||||
@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
||||
for dir_key, out, filename in self.iter_open_files():
|
||||
try:
|
||||
mtime = os.path.getmtime(filename)
|
||||
except:
|
||||
except Exception:
|
||||
self.close_key(dir_key)
|
||||
return
|
||||
|
||||
|
@ -1,26 +1,21 @@
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
|
||||
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
import six
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
|
||||
import traceback
|
||||
|
||||
import gevent.queue
|
||||
import gevent
|
||||
import gevent.queue
|
||||
import requests
|
||||
import six
|
||||
from six.moves.urllib.parse import parse_qsl
|
||||
from warcio.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
|
||||
from pywb.utils.format import ParamFormatter
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class RecorderApp(object):
|
||||
def __init__(self, upstream_host, writer, skip_filters=None, **kwargs):
|
||||
self.upstream_host = upstream_host
|
||||
@ -52,13 +47,13 @@ class RecorderApp(object):
|
||||
|
||||
@staticmethod
|
||||
def default_create_buffer(params, name):
|
||||
return tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||
return tempfile.SpooledTemporaryFile(max_size=512 * 1024)
|
||||
|
||||
def _write_loop(self):
|
||||
while True:
|
||||
try:
|
||||
self._write_one()
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def _write_one(self):
|
||||
@ -88,14 +83,13 @@ class RecorderApp(object):
|
||||
else:
|
||||
self.writer.write_record(resp, params)
|
||||
|
||||
|
||||
finally:
|
||||
try:
|
||||
if req_pay:
|
||||
req_pay.close()
|
||||
no_except_close(req_pay)
|
||||
|
||||
if resp_pay:
|
||||
resp_pay.close()
|
||||
no_except_close(resp_pay)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
@ -155,7 +149,7 @@ class RecorderApp(object):
|
||||
|
||||
finally:
|
||||
if req_stream:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
return self.send_message(msg,
|
||||
'200 OK',
|
||||
@ -169,8 +163,7 @@ class RecorderApp(object):
|
||||
def __call__(self, environ, start_response):
|
||||
try:
|
||||
return self.handle_call(environ, start_response)
|
||||
except:
|
||||
import traceback
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
def handle_call(self, environ, start_response):
|
||||
@ -217,15 +210,15 @@ class RecorderApp(object):
|
||||
|
||||
try:
|
||||
res = requests.request(url=self.upstream_host + request_uri,
|
||||
method=method,
|
||||
data=data,
|
||||
headers=headers,
|
||||
allow_redirects=False,
|
||||
stream=True)
|
||||
method=method,
|
||||
data=data,
|
||||
headers=headers,
|
||||
allow_redirects=False,
|
||||
stream=True)
|
||||
res.raise_for_status()
|
||||
except Exception as e:
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
return self.send_error(e, start_response)
|
||||
|
||||
if not skipping:
|
||||
@ -233,8 +226,7 @@ class RecorderApp(object):
|
||||
req_stream.headers,
|
||||
res.headers,
|
||||
params)
|
||||
for x in self.skip_filters)
|
||||
|
||||
for x in self.skip_filters)
|
||||
|
||||
if not skipping:
|
||||
resp_stream = RespWrapper(res.raw,
|
||||
@ -248,7 +240,7 @@ class RecorderApp(object):
|
||||
else:
|
||||
resp_stream = res.raw
|
||||
if req_is_wrapped:
|
||||
req_stream.out.close()
|
||||
no_except_close(req_stream.out)
|
||||
|
||||
resp_iter = StreamIter(resp_stream)
|
||||
|
||||
@ -260,7 +252,7 @@ class RecorderApp(object):
|
||||
return resp_iter
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class Wrapper(object):
|
||||
def __init__(self, stream, params, create_func):
|
||||
self.stream = stream
|
||||
@ -280,7 +272,7 @@ class Wrapper(object):
|
||||
return buff
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class RespWrapper(Wrapper):
|
||||
def __init__(self, stream, headers, req,
|
||||
params, queue, path, create_func):
|
||||
@ -319,23 +311,20 @@ class RespWrapper(Wrapper):
|
||||
entry = (self.req.headers, self.req.out,
|
||||
self.headers, self.out, self.params)
|
||||
self.queue.put(entry)
|
||||
except:
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
skipping = True
|
||||
|
||||
finally:
|
||||
try:
|
||||
if skipping:
|
||||
self.out.close()
|
||||
self.req.out.close()
|
||||
except:
|
||||
traceback.print_exc()
|
||||
if skipping:
|
||||
no_except_close(self.out)
|
||||
no_except_close(self.req.out)
|
||||
|
||||
self.req.close()
|
||||
no_except_close(self.req)
|
||||
self.req = None
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# ==============================================================================
|
||||
class ReqWrapper(Wrapper):
|
||||
def __init__(self, stream, req_headers, params, create_func):
|
||||
super(ReqWrapper, self).__init__(stream, params, create_func)
|
||||
@ -348,5 +337,3 @@ class ReqWrapper(Wrapper):
|
||||
def close(self):
|
||||
# no need to close wsgi.input
|
||||
pass
|
||||
|
||||
|
||||
|
@ -1,19 +1,15 @@
|
||||
from io import BytesIO
|
||||
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
from contextlib import closing
|
||||
|
||||
import webencodings
|
||||
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
import json
|
||||
import codecs
|
||||
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config, load_py_name
|
||||
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||
from pywb.utils.loaders import load_py_name, load_yaml_config
|
||||
|
||||
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
|
||||
|
||||
@ -344,7 +340,7 @@ class StreamingRewriter(object):
|
||||
yield buff.encode(charset)
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
no_except_close(stream)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -1,6 +1,8 @@
|
||||
from gevent.pywsgi import WSGIServer, WSGIHandler
|
||||
from gevent import spawn
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from gevent import spawn
|
||||
from gevent.pywsgi import WSGIHandler, WSGIServer
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -1,12 +1,35 @@
|
||||
import zlib
|
||||
from contextlib import closing, contextmanager
|
||||
|
||||
from warcio.utils import BUFF_SIZE
|
||||
from warcio.limitreader import LimitReader
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from warcio.limitreader import LimitReader
|
||||
from warcio.utils import BUFF_SIZE
|
||||
|
||||
#=============================================================================
|
||||
|
||||
def no_except_close(closable):
|
||||
"""Attempts to call the close method of the
|
||||
supplied object.
|
||||
|
||||
:param closable: The object to be closed
|
||||
:rtype: None
|
||||
"""
|
||||
if not closable:
|
||||
return
|
||||
|
||||
try:
|
||||
closable.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
release_conn = getattr(closable, 'release_conn', None)
|
||||
if release_conn is not None:
|
||||
release_conn()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# =============================================================================
|
||||
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
|
||||
with closer(stream):
|
||||
if header1:
|
||||
@ -22,19 +45,16 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closin
|
||||
yield buff
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# =============================================================================
|
||||
@contextmanager
|
||||
def call_release_conn(stream):
|
||||
try:
|
||||
yield stream
|
||||
finally:
|
||||
if hasattr(stream, 'release_conn'):
|
||||
stream.release_conn()
|
||||
else:
|
||||
stream.close()
|
||||
no_except_close(stream)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# =============================================================================
|
||||
def chunk_encode_iter(orig_iter):
|
||||
for chunk in orig_iter:
|
||||
if not len(chunk):
|
||||
@ -47,7 +67,7 @@ def chunk_encode_iter(orig_iter):
|
||||
yield b'0\r\n\r\n'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# =============================================================================
|
||||
def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
|
||||
out = SpooledTemporaryFile(buff_size)
|
||||
size = 0
|
||||
@ -65,7 +85,7 @@ def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
|
||||
return StreamIter(out)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# =============================================================================
|
||||
def compress_gzip_iter(orig_iter):
|
||||
compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
|
||||
for chunk in orig_iter:
|
||||
@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader):
|
||||
def readline(self, length=None):
|
||||
self._skip()
|
||||
return super(OffsetLimitReader, self).readline(length)
|
||||
|
||||
|
@ -11,22 +11,22 @@ import requests
|
||||
import yaml
|
||||
|
||||
import six
|
||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
||||
from six.moves.urllib.parse import unquote_plus, urlsplit
|
||||
|
||||
import time
|
||||
import pkgutil
|
||||
import base64
|
||||
import cgi
|
||||
|
||||
from io import open, BytesIO
|
||||
from warcio.limitreader import LimitReader
|
||||
from pywb.utils.io import no_except_close
|
||||
|
||||
try:
|
||||
import boto3
|
||||
from botocore import UNSIGNED
|
||||
from botocore.client import Config
|
||||
|
||||
s3_avail = True
|
||||
except ImportError: #pragma: no cover
|
||||
except ImportError: # pragma: no cover
|
||||
s3_avail = False
|
||||
|
||||
|
||||
@ -39,12 +39,12 @@ def load_py_name(string):
|
||||
return getattr(mod, string[1])
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def is_http(filename):
|
||||
return filename.startswith(('http://', 'https://'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def to_file_url(filename):
|
||||
""" Convert a filename to a file:// url
|
||||
"""
|
||||
@ -52,7 +52,7 @@ def to_file_url(filename):
|
||||
return url
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def from_file_url(url):
|
||||
""" Convert from file:// url to file path
|
||||
"""
|
||||
@ -62,7 +62,7 @@ def from_file_url(url):
|
||||
return url
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def load(filename):
|
||||
return BlockLoader().load(filename)
|
||||
|
||||
@ -75,8 +75,7 @@ def load_yaml_config(config_file):
|
||||
configdata = load(config_file)
|
||||
config = yaml.load(configdata)
|
||||
finally:
|
||||
if configdata:
|
||||
configdata.close()
|
||||
no_except_close(configdata)
|
||||
|
||||
return config
|
||||
|
||||
@ -84,7 +83,6 @@ def load_yaml_config(config_file):
|
||||
# =============================================================================
|
||||
def load_overlay_config(main_env_var, main_default_file='',
|
||||
overlay_env_var='', overlay_file=''):
|
||||
|
||||
configfile = os.environ.get(main_env_var, main_default_file)
|
||||
config = None
|
||||
|
||||
@ -104,7 +102,7 @@ def load_overlay_config(main_env_var, main_default_file='',
|
||||
return config
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def extract_client_cookie(env, cookie_name):
|
||||
cookie_header = env.get('HTTP_COOKIE')
|
||||
if not cookie_header:
|
||||
@ -129,7 +127,7 @@ def extract_client_cookie(env, cookie_name):
|
||||
return value
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
def read_last_line(fh, offset=256):
|
||||
""" Read last line from a seekable file. Start reading
|
||||
from buff before end of file, and double backwards seek
|
||||
@ -150,7 +148,7 @@ def read_last_line(fh, offset=256):
|
||||
return fh.readlines()[-1]
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class BaseLoader(object):
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
@ -159,7 +157,7 @@ class BaseLoader(object):
|
||||
raise NotImplemented()
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class BlockLoader(BaseLoader):
|
||||
"""
|
||||
a loader which can stream blocks of content
|
||||
@ -171,6 +169,7 @@ class BlockLoader(BaseLoader):
|
||||
profile_loader = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(BlockLoader, self).__init__()
|
||||
self.cached = {}
|
||||
self.kwargs = kwargs
|
||||
|
||||
@ -241,7 +240,7 @@ class BlockLoader(BaseLoader):
|
||||
return range_header
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class PackageLoader(BaseLoader):
|
||||
def load(self, url, offset=0, length=-1):
|
||||
if url.startswith('pkg://'):
|
||||
@ -263,11 +262,11 @@ class PackageLoader(BaseLoader):
|
||||
buff.name = url
|
||||
return buff
|
||||
|
||||
#afile = pkg_resources.resource_stream(pkg_split[0],
|
||||
# afile = pkg_resources.resource_stream(pkg_split[0],
|
||||
# pkg_split[1])
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class LocalFileLoader(PackageLoader):
|
||||
def load(self, url, offset=0, length=-1):
|
||||
"""
|
||||
@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader):
|
||||
file_only = True
|
||||
url = filename
|
||||
|
||||
afile = None
|
||||
try:
|
||||
# first, try as file
|
||||
afile = open(url, 'rb')
|
||||
|
||||
except IOError:
|
||||
no_except_close(afile)
|
||||
if file_only:
|
||||
raise
|
||||
|
||||
@ -302,9 +303,10 @@ class LocalFileLoader(PackageLoader):
|
||||
return afile
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class HttpLoader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
super(HttpLoader, self).__init__()
|
||||
self.cookie_maker = kwargs.get('cookie_maker')
|
||||
if not self.cookie_maker:
|
||||
self.cookie_maker = kwargs.get('cookie')
|
||||
@ -333,16 +335,17 @@ class HttpLoader(BaseLoader):
|
||||
return r.raw
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class S3Loader(BaseLoader):
|
||||
def __init__(self, **kwargs):
|
||||
super(S3Loader, self).__init__()
|
||||
self.client = None
|
||||
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
||||
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
||||
|
||||
def load(self, url, offset, length):
|
||||
if not s3_avail: #pragma: no cover
|
||||
raise IOError('To load from s3 paths, ' +
|
||||
if not s3_avail: # pragma: no cover
|
||||
raise IOError('To load from s3 paths, ' +
|
||||
'you must install boto3: pip install boto3')
|
||||
|
||||
aws_access_key_id = self.aws_access_key_id
|
||||
@ -372,8 +375,8 @@ class S3Loader(BaseLoader):
|
||||
config = None
|
||||
|
||||
client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
config=config)
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
config=config)
|
||||
else:
|
||||
client = self.client
|
||||
|
||||
@ -398,15 +401,16 @@ class S3Loader(BaseLoader):
|
||||
return obj['Body']
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
# Signed Cookie-Maker
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
|
||||
class HMACCookieMaker(object):
|
||||
"""
|
||||
Utility class to produce signed HMAC digest cookies
|
||||
to be used with each http request
|
||||
"""
|
||||
|
||||
def __init__(self, key, name, duration=10):
|
||||
self.key = key
|
||||
self.name = name
|
||||
@ -435,4 +439,3 @@ class HMACCookieMaker(object):
|
||||
|
||||
# ============================================================================
|
||||
BlockLoader.init_default_loaders()
|
||||
|
||||
|
@ -1,22 +1,18 @@
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
||||
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.utils.format import ParamFormatter, res_template
|
||||
from pywb.utils.memento import MementoUtils
|
||||
import logging
|
||||
import re
|
||||
|
||||
import redis
|
||||
|
||||
import requests
|
||||
from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
|
||||
|
||||
import re
|
||||
import logging
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.format import res_template
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource):
|
||||
def handle_timemap(self, params):
|
||||
url = res_template(self.timemap_url, params)
|
||||
headers = self._get_headers(params)
|
||||
res = None
|
||||
try:
|
||||
res = self.sesh.get(url,
|
||||
headers=headers,
|
||||
timeout=params.get('_timeout'))
|
||||
|
||||
res.raise_for_status()
|
||||
assert(res.text)
|
||||
|
||||
except Exception as e:
|
||||
no_except_close(res)
|
||||
self.logger.debug('FAILED: ' + str(e))
|
||||
raise NotFoundException(url)
|
||||
|
||||
@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
url = params['url']
|
||||
load_url = self.timegate_url.format(url=url, timestamp=timestamp)
|
||||
|
||||
res = None
|
||||
try:
|
||||
headers = self._get_headers(params)
|
||||
res = self.sesh.head(load_url, headers=headers)
|
||||
except Exception as e:
|
||||
no_except_close(res)
|
||||
raise NotFoundException(url)
|
||||
|
||||
if res and res.headers.get('Memento-Datetime'):
|
||||
if res.status_code >= 400:
|
||||
no_except_close(res)
|
||||
raise NotFoundException(url)
|
||||
|
||||
if res.status_code >= 300:
|
||||
|
@ -1,24 +1,20 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import os
|
||||
import collections
|
||||
import itertools
|
||||
import logging
|
||||
import datetime
|
||||
import json
|
||||
import six
|
||||
|
||||
from six.moves import map
|
||||
|
||||
from warcio.bufferedreaders import gzip_decompressor
|
||||
|
||||
#from pywb.warcserver.index.cdxsource import CDXSource
|
||||
from pywb.warcserver.index.indexsource import BaseIndexSource
|
||||
from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject
|
||||
from pywb.warcserver.index.query import CDXQuery
|
||||
|
||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||
from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject
|
||||
# from pywb.warcserver.index.cdxsource import CDXSource
|
||||
from pywb.warcserver.index.indexsource import BaseIndexSource
|
||||
from pywb.warcserver.index.query import CDXQuery
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
if end_line == last_line and query.key >= last_line:
|
||||
first_line = last_line
|
||||
else:
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
if query.page_count:
|
||||
yield self._page_info(0, pagesize, 0)
|
||||
return
|
||||
@ -240,13 +236,13 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
blocks = -1
|
||||
|
||||
yield self._page_info(total_pages, pagesize, blocks + 1)
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
return
|
||||
|
||||
curr_page = query.page
|
||||
if curr_page >= total_pages or curr_page < 0:
|
||||
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
raise CDXException(msg.format(curr_page, total_pages - 1))
|
||||
|
||||
startline = curr_page * pagesize
|
||||
@ -259,12 +255,14 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
else:
|
||||
startline -= 1
|
||||
|
||||
idxiter = itertools.islice(first_iter, startline, endline)
|
||||
for idx in idxiter:
|
||||
yield idx
|
||||
|
||||
reader.close()
|
||||
|
||||
try:
|
||||
idxiter = itertools.islice(first_iter, startline, endline)
|
||||
for idx in idxiter:
|
||||
yield idx
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
no_except_close(reader)
|
||||
|
||||
def search_by_line_num(self, reader, line): # pragma: no cover
|
||||
def line_cmp(line1, line2):
|
||||
@ -349,7 +347,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
||||
for r in ranges:
|
||||
yield decompress_block(r)
|
||||
finally:
|
||||
reader.close()
|
||||
no_except_close(reader)
|
||||
|
||||
# iterate over all blocks
|
||||
iter_ = itertools.chain.from_iterable(iter_blocks(reader))
|
||||
|
@ -1,20 +1,19 @@
|
||||
import six
|
||||
from warcio.recordloader import ArchiveLoadFailed
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
|
||||
from pywb.utils.io import no_except_close
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
|
||||
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
import six
|
||||
|
||||
|
||||
#=================================================================
|
||||
# =================================================================
|
||||
class ResolvingLoader(object):
|
||||
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
||||
|
||||
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
|
||||
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
|
||||
self.path_resolvers = path_resolvers
|
||||
self.record_loader = record_loader
|
||||
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
|
||||
self.no_record_parse = no_record_parse
|
||||
|
||||
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
|
||||
@ -29,7 +28,7 @@ class ResolvingLoader(object):
|
||||
elif headers_record != payload_record:
|
||||
# close remainder of stream as this record only used for
|
||||
# (already parsed) headers
|
||||
headers_record.raw_stream.close()
|
||||
no_except_close(headers_record.raw_stream)
|
||||
|
||||
# special case: check if headers record is actually empty
|
||||
# (eg empty revisit), then use headers from revisit
|
||||
@ -37,6 +36,10 @@ class ResolvingLoader(object):
|
||||
headers_record = payload_record
|
||||
|
||||
if not headers_record or not payload_record:
|
||||
if headers_record:
|
||||
no_except_close(headers_record.raw_stream)
|
||||
if payload_record:
|
||||
no_except_close(payload_record.raw_stream)
|
||||
raise ArchiveLoadFailed('Could not load ' + str(cdx))
|
||||
|
||||
# ensure status line is valid from here
|
||||
@ -57,12 +60,13 @@ class ResolvingLoader(object):
|
||||
from a different url to find the original record.
|
||||
"""
|
||||
has_curr = (cdx['filename'] != '-')
|
||||
#has_orig = (cdx.get('orig.filename', '-') != '-')
|
||||
# has_orig = (cdx.get('orig.filename', '-') != '-')
|
||||
orig_f = cdx.get('orig.filename')
|
||||
has_orig = orig_f and orig_f != '-'
|
||||
|
||||
# load headers record from cdx['filename'] unless it is '-' (rare)
|
||||
headers_record = None
|
||||
payload_record = None
|
||||
if has_curr:
|
||||
headers_record = self._resolve_path_load(cdx, False, failed_files)
|
||||
|
||||
@ -85,7 +89,6 @@ class ResolvingLoader(object):
|
||||
|
||||
return headers_record, payload_record
|
||||
|
||||
|
||||
def _resolve_path_load(self, cdx, is_original, failed_files):
|
||||
"""
|
||||
Load specific record based on filename, offset and length
|
||||
@ -127,8 +130,8 @@ class ResolvingLoader(object):
|
||||
any_found = True
|
||||
try:
|
||||
return (self.record_loader.
|
||||
load(path, offset, length,
|
||||
no_record_parse=self.no_record_parse))
|
||||
load(path, offset, length,
|
||||
no_record_parse=self.no_record_parse))
|
||||
|
||||
except Exception as ue:
|
||||
last_exc = ue
|
||||
@ -140,12 +143,12 @@ class ResolvingLoader(object):
|
||||
failed_files.append(filename)
|
||||
|
||||
if last_exc:
|
||||
#msg = str(last_exc.__class__.__name__)
|
||||
# msg = str(last_exc.__class__.__name__)
|
||||
msg = str(last_exc)
|
||||
else:
|
||||
msg = 'Archive File Not Found'
|
||||
|
||||
#raise ArchiveLoadFailed(msg, filename), None, last_traceback
|
||||
# raise ArchiveLoadFailed(msg, filename), None, last_traceback
|
||||
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
|
||||
|
||||
def _load_different_url_payload(self, cdx, headers_record,
|
||||
|
@ -1,36 +1,31 @@
|
||||
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
||||
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
||||
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
||||
|
||||
from pywb.utils.wbexception import LiveResourceException, WbException
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
|
||||
from pywb.utils.format import ParamFormatter
|
||||
|
||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||
|
||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from io import BytesIO
|
||||
|
||||
import uuid
|
||||
import six
|
||||
import itertools
|
||||
import json
|
||||
import glob
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
from requests.models import PreparedRequest
|
||||
from six.moves.urllib.parse import quote, unquote, urlsplit
|
||||
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
||||
from warcio.timeutils import (
|
||||
datetime_to_http_date,
|
||||
datetime_to_iso_date,
|
||||
datetime_to_timestamp,
|
||||
http_date_to_datetime,
|
||||
iso_date_to_datetime,
|
||||
timestamp_to_datetime
|
||||
)
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.format import ParamFormatter
|
||||
from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
|
||||
from pywb.utils.memento import MementoUtils
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||
|
||||
logger = logging.getLogger('warcserver')
|
||||
|
||||
@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
http_headers.get_statuscode(),
|
||||
http_headers.get_header('Location'))
|
||||
except LiveResourceException:
|
||||
headers.raw_stream.close()
|
||||
payload.raw_stream.close()
|
||||
no_except_close(headers.raw_stream)
|
||||
no_except_close(payload.raw_stream)
|
||||
raise
|
||||
|
||||
http_headers_buff = http_headers.to_bytes()
|
||||
@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
||||
|
||||
warc_headers.replace_header('WARC-Date',
|
||||
headers.rec_headers.get_header('WARC-Date'))
|
||||
|
||||
headers.raw_stream.close()
|
||||
no_except_close(headers.raw_stream)
|
||||
|
||||
return (warc_headers, http_headers_buff, payload.raw_stream)
|
||||
|
||||
@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader):
|
||||
p = PreparedRequest()
|
||||
try:
|
||||
p.prepare_url(load_url, None)
|
||||
except:
|
||||
except Exception:
|
||||
raise LiveResourceException(load_url)
|
||||
p.prepare_headers(None)
|
||||
p.prepare_auth(None, load_url)
|
||||
@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader):
|
||||
elif cdx.get('memento_url'):
|
||||
# if 'memento_url' set and no Memento-Datetime header present
|
||||
# then its an error
|
||||
no_except_close(upstream_res)
|
||||
return None
|
||||
|
||||
agg_type = upstream_res.headers.get('Warcserver-Type')
|
||||
@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader):
|
||||
else:
|
||||
conn = adapter.poolmanager
|
||||
|
||||
upstream_res = None
|
||||
try:
|
||||
upstream_res = conn.urlopen(method=method,
|
||||
url=load_url,
|
||||
@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader):
|
||||
return upstream_res
|
||||
|
||||
except Exception as e:
|
||||
if upstream_res:
|
||||
no_except_close(upstream_res)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@ -527,7 +525,7 @@ class VideoLoader(BaseLoader):
|
||||
self.ydl = None
|
||||
return
|
||||
|
||||
self.ydl = YoutubeDL(dict(simulate=True,
|
||||
self.ydl = YoutubeDL(dict(simulate=True, quiet=True,
|
||||
youtube_include_dash_manifest=False))
|
||||
|
||||
self.ydl.add_default_info_extractors()
|
||||
|
@ -9,7 +9,7 @@ brotlipy
|
||||
pyyaml
|
||||
werkzeug
|
||||
webencodings
|
||||
gevent
|
||||
gevent==1.4.0
|
||||
webassets==0.12.1
|
||||
portalocker
|
||||
wsgiprox>=1.5.1
|
||||
|
Loading…
x
Reference in New Issue
Block a user