mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Improved handling of open http connections and file handles (#463)
* Improved pywb's closing of open file handles and HTTP connections by adding no_except_close to pywb.utils.io; replaced close calls with no_except_close; reformatted and optimized the imports of the files that were modified. Additional CI build fixes: - pin gevent to 1.4.0 to ensure that the build of pywb on Ubuntu uses gevent's wheel distribution - youtube-dl fix: use youtube-dl in quiet mode to avoid errors with youtube-dl logging in pytest
This commit is contained in:
parent
22b4297fc5
commit
a907b2b511
@ -1,35 +1,23 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from werkzeug.http import HTTP_STATUS_CODES
|
|
||||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||||
|
|
||||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
|
||||||
|
|
||||||
from pywb.utils.wbexception import WbException
|
|
||||||
from pywb.utils.canonicalize import canonicalize
|
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
|
||||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
|
|
||||||
from pywb.utils.memento import MementoUtils
|
|
||||||
|
|
||||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
|
||||||
from warcio.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||||
|
from werkzeug.http import HTTP_STATUS_CODES
|
||||||
|
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
|
||||||
from pywb.apps.wbrequestresponse import WbResponse
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
|
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
||||||
|
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
||||||
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from io import BytesIO
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from copy import copy
|
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
||||||
|
from pywb.utils.memento import MementoUtils
|
||||||
import gevent
|
from pywb.utils.wbexception import WbException
|
||||||
import json
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -129,7 +117,7 @@ class RewriterApp(object):
|
|||||||
if accept_dt:
|
if accept_dt:
|
||||||
try:
|
try:
|
||||||
wb_url.timestamp = http_date_to_timestamp(accept_dt)
|
wb_url.timestamp = http_date_to_timestamp(accept_dt)
|
||||||
except:
|
except Exception:
|
||||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
|
raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
|
||||||
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
# return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')
|
||||||
|
|
||||||
@ -193,11 +181,6 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
if range_start >= content_length or range_end >= content_length:
|
if range_start >= content_length or range_end >= content_length:
|
||||||
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
|
details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
|
||||||
try:
|
|
||||||
r.raw.close()
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
raise UpstreamException(416, url=wb_url.url, details=details)
|
raise UpstreamException(416, url=wb_url.url, details=details)
|
||||||
|
|
||||||
range_len = range_end - range_start + 1
|
range_len = range_end - range_start + 1
|
||||||
@ -296,9 +279,10 @@ class RewriterApp(object):
|
|||||||
error = None
|
error = None
|
||||||
try:
|
try:
|
||||||
error = r.raw.read()
|
error = r.raw.read()
|
||||||
r.raw.close()
|
except Exception:
|
||||||
except:
|
|
||||||
pass
|
pass
|
||||||
|
finally:
|
||||||
|
no_except_close(r.raw)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
error = error.decode('utf-8')
|
error = error.decode('utf-8')
|
||||||
@ -316,10 +300,7 @@ class RewriterApp(object):
|
|||||||
# add trailing slash
|
# add trailing slash
|
||||||
new_path = url_parts.path + '/'
|
new_path = url_parts.path + '/'
|
||||||
|
|
||||||
try:
|
no_except_close(r.raw)
|
||||||
r.raw.close()
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return self.send_redirect(new_path, url_parts, urlrewriter)
|
return self.send_redirect(new_path, url_parts, urlrewriter)
|
||||||
|
|
||||||
@ -511,7 +492,6 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
||||||
|
|
||||||
|
|
||||||
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
||||||
req_data = inputreq.reconstruct_request(wb_url.url)
|
req_data = inputreq.reconstruct_request(wb_url.url)
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from warcio.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
|
from pywb.utils.io import no_except_close
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ujson as json
|
import ujson as json
|
||||||
except ImportError: # pragma: no cover
|
except ImportError: # pragma: no cover
|
||||||
@ -151,8 +153,7 @@ class WbResponse(object):
|
|||||||
self.status_headers.headers)
|
self.status_headers.headers)
|
||||||
request_method = env['REQUEST_METHOD']
|
request_method = env['REQUEST_METHOD']
|
||||||
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
|
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
|
||||||
if hasattr(self.body, 'close'):
|
no_except_close(self.body)
|
||||||
self.body.close()
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
return self.body
|
return self.body
|
||||||
|
@ -2,15 +2,14 @@ import base64
|
|||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import portalocker
|
import portalocker
|
||||||
|
|
||||||
from warcio.timeutils import timestamp20_now
|
from warcio.timeutils import timestamp20_now
|
||||||
from warcio.warcwriter import BaseWARCWriter
|
from warcio.warcwriter import BaseWARCWriter
|
||||||
|
|
||||||
from pywb.utils.format import res_template
|
from pywb.utils.format import res_template
|
||||||
|
from pywb.utils.io import no_except_close
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
os.makedirs(path)
|
os.makedirs(path)
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
fh = open(filename, 'a+b')
|
fh = open(filename, 'a+b')
|
||||||
@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
try:
|
try:
|
||||||
if os.name != 'nt':
|
if os.name != 'nt':
|
||||||
portalocker.lock(fh, portalocker.LOCK_UN)
|
portalocker.lock(fh, portalocker.LOCK_UN)
|
||||||
fh.close()
|
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
return False
|
return False
|
||||||
|
finally:
|
||||||
|
no_except_close(fh)
|
||||||
|
|
||||||
def get_dir_key(self, params):
|
def get_dir_key(self, params):
|
||||||
return res_template(self.key_template, params)
|
return res_template(self.key_template, params)
|
||||||
@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
for dir_key, out, filename in self.iter_open_files():
|
for dir_key, out, filename in self.iter_open_files():
|
||||||
try:
|
try:
|
||||||
mtime = os.path.getmtime(filename)
|
mtime = os.path.getmtime(filename)
|
||||||
except:
|
except Exception:
|
||||||
self.close_key(dir_key)
|
self.close_key(dir_key)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -1,23 +1,18 @@
|
|||||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
|
||||||
from pywb.utils.format import ParamFormatter, res_template
|
|
||||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
|
||||||
|
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
|
||||||
|
|
||||||
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
|
|
||||||
|
|
||||||
from six.moves.urllib.parse import parse_qsl
|
|
||||||
import six
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import gevent.queue
|
|
||||||
import gevent
|
import gevent
|
||||||
|
import gevent.queue
|
||||||
|
import requests
|
||||||
|
import six
|
||||||
|
from six.moves.urllib.parse import parse_qsl
|
||||||
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
|
from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter
|
||||||
|
from pywb.utils.format import ParamFormatter
|
||||||
|
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||||
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
|
||||||
|
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
@ -58,7 +53,7 @@ class RecorderApp(object):
|
|||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
self._write_one()
|
self._write_one()
|
||||||
except:
|
except Exception:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
def _write_one(self):
|
def _write_one(self):
|
||||||
@ -88,14 +83,13 @@ class RecorderApp(object):
|
|||||||
else:
|
else:
|
||||||
self.writer.write_record(resp, params)
|
self.writer.write_record(resp, params)
|
||||||
|
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
if req_pay:
|
if req_pay:
|
||||||
req_pay.close()
|
no_except_close(req_pay)
|
||||||
|
|
||||||
if resp_pay:
|
if resp_pay:
|
||||||
resp_pay.close()
|
no_except_close(resp_pay)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
@ -155,7 +149,7 @@ class RecorderApp(object):
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
if req_stream:
|
if req_stream:
|
||||||
req_stream.out.close()
|
no_except_close(req_stream.out)
|
||||||
|
|
||||||
return self.send_message(msg,
|
return self.send_message(msg,
|
||||||
'200 OK',
|
'200 OK',
|
||||||
@ -169,8 +163,7 @@ class RecorderApp(object):
|
|||||||
def __call__(self, environ, start_response):
|
def __call__(self, environ, start_response):
|
||||||
try:
|
try:
|
||||||
return self.handle_call(environ, start_response)
|
return self.handle_call(environ, start_response)
|
||||||
except:
|
except Exception:
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
def handle_call(self, environ, start_response):
|
def handle_call(self, environ, start_response):
|
||||||
@ -225,7 +218,7 @@ class RecorderApp(object):
|
|||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if req_is_wrapped:
|
if req_is_wrapped:
|
||||||
req_stream.out.close()
|
no_except_close(req_stream.out)
|
||||||
return self.send_error(e, start_response)
|
return self.send_error(e, start_response)
|
||||||
|
|
||||||
if not skipping:
|
if not skipping:
|
||||||
@ -235,7 +228,6 @@ class RecorderApp(object):
|
|||||||
params)
|
params)
|
||||||
for x in self.skip_filters)
|
for x in self.skip_filters)
|
||||||
|
|
||||||
|
|
||||||
if not skipping:
|
if not skipping:
|
||||||
resp_stream = RespWrapper(res.raw,
|
resp_stream = RespWrapper(res.raw,
|
||||||
res.headers,
|
res.headers,
|
||||||
@ -248,7 +240,7 @@ class RecorderApp(object):
|
|||||||
else:
|
else:
|
||||||
resp_stream = res.raw
|
resp_stream = res.raw
|
||||||
if req_is_wrapped:
|
if req_is_wrapped:
|
||||||
req_stream.out.close()
|
no_except_close(req_stream.out)
|
||||||
|
|
||||||
resp_iter = StreamIter(resp_stream)
|
resp_iter = StreamIter(resp_stream)
|
||||||
|
|
||||||
@ -319,19 +311,16 @@ class RespWrapper(Wrapper):
|
|||||||
entry = (self.req.headers, self.req.out,
|
entry = (self.req.headers, self.req.out,
|
||||||
self.headers, self.out, self.params)
|
self.headers, self.out, self.params)
|
||||||
self.queue.put(entry)
|
self.queue.put(entry)
|
||||||
except:
|
except Exception:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
skipping = True
|
skipping = True
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
try:
|
|
||||||
if skipping:
|
if skipping:
|
||||||
self.out.close()
|
no_except_close(self.out)
|
||||||
self.req.out.close()
|
no_except_close(self.req.out)
|
||||||
except:
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
self.req.close()
|
no_except_close(self.req)
|
||||||
self.req = None
|
self.req = None
|
||||||
|
|
||||||
|
|
||||||
@ -348,5 +337,3 @@ class ReqWrapper(Wrapper):
|
|||||||
def close(self):
|
def close(self):
|
||||||
# no need to close wsgi.input
|
# no need to close wsgi.input
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,19 +1,15 @@
|
|||||||
from io import BytesIO
|
import codecs
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
|
import webencodings
|
||||||
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
import re
|
from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close
|
||||||
import webencodings
|
from pywb.utils.loaders import load_py_name, load_yaml_config
|
||||||
import tempfile
|
|
||||||
import json
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config, load_py_name
|
|
||||||
|
|
||||||
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
|
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
|
||||||
|
|
||||||
@ -344,7 +340,7 @@ class StreamingRewriter(object):
|
|||||||
yield buff.encode(charset)
|
yield buff.encode(charset)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
stream.close()
|
no_except_close(stream)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
from gevent.pywsgi import WSGIServer, WSGIHandler
|
|
||||||
from gevent import spawn
|
|
||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from gevent import spawn
|
||||||
|
from gevent.pywsgi import WSGIHandler, WSGIServer
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
@ -1,10 +1,33 @@
|
|||||||
import zlib
|
import zlib
|
||||||
from contextlib import closing, contextmanager
|
from contextlib import closing, contextmanager
|
||||||
|
|
||||||
from warcio.utils import BUFF_SIZE
|
|
||||||
from warcio.limitreader import LimitReader
|
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
|
from warcio.limitreader import LimitReader
|
||||||
|
from warcio.utils import BUFF_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
def no_except_close(closable):
    """Close the supplied object, suppressing any error raised while doing so.

    Calls ``closable.close()`` and then, if the object exposes a
    ``release_conn`` attribute, calls that as well. Each step is attempted
    independently, so a failing ``close()`` does not prevent
    ``release_conn()`` from running.

    :param closable: The object to be closed, or None when there is
        nothing to close
    :rtype: None
    """
    # Only None means "nothing to close". A truthiness test here would also
    # skip live objects that merely evaluate to False -- for example a
    # requests.Response with a 4xx/5xx status, whose __bool__ returns its
    # ``ok`` flag -- and leave their underlying connection open.
    if closable is None:
        return

    try:
        closable.close()
    except Exception:
        # best-effort cleanup: a failed close must never mask the caller's
        # own error handling
        pass

    try:
        release_conn = getattr(closable, 'release_conn', None)
        if release_conn is not None:
            release_conn()
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
|
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing):
|
||||||
@ -28,10 +51,7 @@ def call_release_conn(stream):
|
|||||||
try:
|
try:
|
||||||
yield stream
|
yield stream
|
||||||
finally:
|
finally:
|
||||||
if hasattr(stream, 'release_conn'):
|
no_except_close(stream)
|
||||||
stream.release_conn()
|
|
||||||
else:
|
|
||||||
stream.close()
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader):
|
|||||||
def readline(self, length=None):
|
def readline(self, length=None):
|
||||||
self._skip()
|
self._skip()
|
||||||
return super(OffsetLimitReader, self).readline(length)
|
return super(OffsetLimitReader, self).readline(length)
|
||||||
|
|
||||||
|
@ -11,20 +11,20 @@ import requests
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
import six
|
import six
|
||||||
from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode
|
from six.moves.urllib.parse import unquote_plus, urlsplit
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import pkgutil
|
import pkgutil
|
||||||
import base64
|
|
||||||
import cgi
|
|
||||||
|
|
||||||
from io import open, BytesIO
|
from io import open, BytesIO
|
||||||
from warcio.limitreader import LimitReader
|
from warcio.limitreader import LimitReader
|
||||||
|
from pywb.utils.io import no_except_close
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import boto3
|
import boto3
|
||||||
from botocore import UNSIGNED
|
from botocore import UNSIGNED
|
||||||
from botocore.client import Config
|
from botocore.client import Config
|
||||||
|
|
||||||
s3_avail = True
|
s3_avail = True
|
||||||
except ImportError: # pragma: no cover
|
except ImportError: # pragma: no cover
|
||||||
s3_avail = False
|
s3_avail = False
|
||||||
@ -75,8 +75,7 @@ def load_yaml_config(config_file):
|
|||||||
configdata = load(config_file)
|
configdata = load(config_file)
|
||||||
config = yaml.load(configdata)
|
config = yaml.load(configdata)
|
||||||
finally:
|
finally:
|
||||||
if configdata:
|
no_except_close(configdata)
|
||||||
configdata.close()
|
|
||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
@ -84,7 +83,6 @@ def load_yaml_config(config_file):
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
def load_overlay_config(main_env_var, main_default_file='',
|
def load_overlay_config(main_env_var, main_default_file='',
|
||||||
overlay_env_var='', overlay_file=''):
|
overlay_env_var='', overlay_file=''):
|
||||||
|
|
||||||
configfile = os.environ.get(main_env_var, main_default_file)
|
configfile = os.environ.get(main_env_var, main_default_file)
|
||||||
config = None
|
config = None
|
||||||
|
|
||||||
@ -171,6 +169,7 @@ class BlockLoader(BaseLoader):
|
|||||||
profile_loader = None
|
profile_loader = None
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
super(BlockLoader, self).__init__()
|
||||||
self.cached = {}
|
self.cached = {}
|
||||||
self.kwargs = kwargs
|
self.kwargs = kwargs
|
||||||
|
|
||||||
@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader):
|
|||||||
file_only = True
|
file_only = True
|
||||||
url = filename
|
url = filename
|
||||||
|
|
||||||
|
afile = None
|
||||||
try:
|
try:
|
||||||
# first, try as file
|
# first, try as file
|
||||||
afile = open(url, 'rb')
|
afile = open(url, 'rb')
|
||||||
|
|
||||||
except IOError:
|
except IOError:
|
||||||
|
no_except_close(afile)
|
||||||
if file_only:
|
if file_only:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@ -305,6 +306,7 @@ class LocalFileLoader(PackageLoader):
|
|||||||
# =================================================================
|
# =================================================================
|
||||||
class HttpLoader(BaseLoader):
|
class HttpLoader(BaseLoader):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
super(HttpLoader, self).__init__()
|
||||||
self.cookie_maker = kwargs.get('cookie_maker')
|
self.cookie_maker = kwargs.get('cookie_maker')
|
||||||
if not self.cookie_maker:
|
if not self.cookie_maker:
|
||||||
self.cookie_maker = kwargs.get('cookie')
|
self.cookie_maker = kwargs.get('cookie')
|
||||||
@ -336,6 +338,7 @@ class HttpLoader(BaseLoader):
|
|||||||
# =================================================================
|
# =================================================================
|
||||||
class S3Loader(BaseLoader):
|
class S3Loader(BaseLoader):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
super(S3Loader, self).__init__()
|
||||||
self.client = None
|
self.client = None
|
||||||
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
self.aws_access_key_id = kwargs.get('aws_access_key_id')
|
||||||
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
|
||||||
@ -407,6 +410,7 @@ class HMACCookieMaker(object):
|
|||||||
Utility class to produce signed HMAC digest cookies
|
Utility class to produce signed HMAC digest cookies
|
||||||
to be used with each http request
|
to be used with each http request
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, key, name, duration=10):
|
def __init__(self, key, name, duration=10):
|
||||||
self.key = key
|
self.key = key
|
||||||
self.name = name
|
self.name = name
|
||||||
@ -435,4 +439,3 @@ class HMACCookieMaker(object):
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
BlockLoader.init_default_loaders()
|
BlockLoader.init_default_loaders()
|
||||||
|
|
||||||
|
@ -1,22 +1,18 @@
|
|||||||
from pywb.utils.binsearch import iter_range
|
import logging
|
||||||
from pywb.utils.canonicalize import canonicalize
|
import re
|
||||||
from pywb.utils.wbexception import NotFoundException
|
|
||||||
|
|
||||||
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
|
||||||
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
|
||||||
|
|
||||||
from pywb.warcserver.http import DefaultAdapters
|
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
|
||||||
|
|
||||||
from pywb.utils.format import ParamFormatter, res_template
|
|
||||||
from pywb.utils.memento import MementoUtils
|
|
||||||
|
|
||||||
import redis
|
import redis
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
|
||||||
|
|
||||||
import re
|
from pywb.utils.binsearch import iter_range
|
||||||
import logging
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
from pywb.utils.format import res_template
|
||||||
|
from pywb.utils.io import no_except_close
|
||||||
|
from pywb.utils.memento import MementoUtils
|
||||||
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
from pywb.warcserver.http import DefaultAdapters
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
def handle_timemap(self, params):
|
def handle_timemap(self, params):
|
||||||
url = res_template(self.timemap_url, params)
|
url = res_template(self.timemap_url, params)
|
||||||
headers = self._get_headers(params)
|
headers = self._get_headers(params)
|
||||||
|
res = None
|
||||||
try:
|
try:
|
||||||
res = self.sesh.get(url,
|
res = self.sesh.get(url,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
timeout=params.get('_timeout'))
|
timeout=params.get('_timeout'))
|
||||||
|
|
||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
assert(res.text)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
no_except_close(res)
|
||||||
self.logger.debug('FAILED: ' + str(e))
|
self.logger.debug('FAILED: ' + str(e))
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
|
||||||
@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource):
|
|||||||
url = params['url']
|
url = params['url']
|
||||||
load_url = self.timegate_url.format(url=url, timestamp=timestamp)
|
load_url = self.timegate_url.format(url=url, timestamp=timestamp)
|
||||||
|
|
||||||
|
res = None
|
||||||
try:
|
try:
|
||||||
headers = self._get_headers(params)
|
headers = self._get_headers(params)
|
||||||
res = self.sesh.head(load_url, headers=headers)
|
res = self.sesh.head(load_url, headers=headers)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
no_except_close(res)
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
|
||||||
if res and res.headers.get('Memento-Datetime'):
|
if res and res.headers.get('Memento-Datetime'):
|
||||||
if res.status_code >= 400:
|
if res.status_code >= 400:
|
||||||
|
no_except_close(res)
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
|
||||||
if res.status_code >= 300:
|
if res.status_code >= 300:
|
||||||
|
@ -1,25 +1,21 @@
|
|||||||
|
import datetime
|
||||||
|
import itertools
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import os
|
|
||||||
import collections
|
|
||||||
import itertools
|
|
||||||
import logging
|
|
||||||
import datetime
|
|
||||||
import json
|
|
||||||
import six
|
import six
|
||||||
|
|
||||||
from six.moves import map
|
|
||||||
|
|
||||||
from warcio.bufferedreaders import gzip_decompressor
|
from warcio.bufferedreaders import gzip_decompressor
|
||||||
|
|
||||||
|
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||||
|
from pywb.utils.io import no_except_close
|
||||||
|
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject
|
||||||
# from pywb.warcserver.index.cdxsource import CDXSource
|
# from pywb.warcserver.index.cdxsource import CDXSource
|
||||||
from pywb.warcserver.index.indexsource import BaseIndexSource
|
from pywb.warcserver.index.indexsource import BaseIndexSource
|
||||||
from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject
|
|
||||||
from pywb.warcserver.index.query import CDXQuery
|
from pywb.warcserver.index.query import CDXQuery
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class ZipBlocks(object):
|
class ZipBlocks(object):
|
||||||
@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
|||||||
if end_line == last_line and query.key >= last_line:
|
if end_line == last_line and query.key >= last_line:
|
||||||
first_line = last_line
|
first_line = last_line
|
||||||
else:
|
else:
|
||||||
reader.close()
|
no_except_close(reader)
|
||||||
if query.page_count:
|
if query.page_count:
|
||||||
yield self._page_info(0, pagesize, 0)
|
yield self._page_info(0, pagesize, 0)
|
||||||
return
|
return
|
||||||
@ -240,13 +236,13 @@ class ZipNumIndexSource(BaseIndexSource):
|
|||||||
blocks = -1
|
blocks = -1
|
||||||
|
|
||||||
yield self._page_info(total_pages, pagesize, blocks + 1)
|
yield self._page_info(total_pages, pagesize, blocks + 1)
|
||||||
reader.close()
|
no_except_close(reader)
|
||||||
return
|
return
|
||||||
|
|
||||||
curr_page = query.page
|
curr_page = query.page
|
||||||
if curr_page >= total_pages or curr_page < 0:
|
if curr_page >= total_pages or curr_page < 0:
|
||||||
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
|
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
|
||||||
reader.close()
|
no_except_close(reader)
|
||||||
raise CDXException(msg.format(curr_page, total_pages - 1))
|
raise CDXException(msg.format(curr_page, total_pages - 1))
|
||||||
|
|
||||||
startline = curr_page * pagesize
|
startline = curr_page * pagesize
|
||||||
@ -259,12 +255,14 @@ class ZipNumIndexSource(BaseIndexSource):
|
|||||||
else:
|
else:
|
||||||
startline -= 1
|
startline -= 1
|
||||||
|
|
||||||
|
try:
|
||||||
idxiter = itertools.islice(first_iter, startline, endline)
|
idxiter = itertools.islice(first_iter, startline, endline)
|
||||||
for idx in idxiter:
|
for idx in idxiter:
|
||||||
yield idx
|
yield idx
|
||||||
|
except Exception:
|
||||||
reader.close()
|
pass
|
||||||
|
finally:
|
||||||
|
no_except_close(reader)
|
||||||
|
|
||||||
def search_by_line_num(self, reader, line): # pragma: no cover
|
def search_by_line_num(self, reader, line): # pragma: no cover
|
||||||
def line_cmp(line1, line2):
|
def line_cmp(line1, line2):
|
||||||
@ -349,7 +347,7 @@ class ZipNumIndexSource(BaseIndexSource):
|
|||||||
for r in ranges:
|
for r in ranges:
|
||||||
yield decompress_block(r)
|
yield decompress_block(r)
|
||||||
finally:
|
finally:
|
||||||
reader.close()
|
no_except_close(reader)
|
||||||
|
|
||||||
# iterate over all blocks
|
# iterate over all blocks
|
||||||
iter_ = itertools.chain.from_iterable(iter_blocks(reader))
|
iter_ = itertools.chain.from_iterable(iter_blocks(reader))
|
||||||
|
@ -1,20 +1,19 @@
|
|||||||
|
import six
|
||||||
from warcio.recordloader import ArchiveLoadFailed
|
from warcio.recordloader import ArchiveLoadFailed
|
||||||
from warcio.timeutils import iso_date_to_timestamp
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
|
|
||||||
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
|
from pywb.utils.io import no_except_close
|
||||||
|
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
|
||||||
import six
|
|
||||||
|
|
||||||
|
|
||||||
# =================================================================
|
# =================================================================
|
||||||
class ResolvingLoader(object):
|
class ResolvingLoader(object):
|
||||||
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
||||||
|
|
||||||
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
|
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
|
||||||
self.path_resolvers = path_resolvers
|
self.path_resolvers = path_resolvers
|
||||||
self.record_loader = record_loader
|
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
|
||||||
self.no_record_parse = no_record_parse
|
self.no_record_parse = no_record_parse
|
||||||
|
|
||||||
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
|
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
|
||||||
@ -29,7 +28,7 @@ class ResolvingLoader(object):
|
|||||||
elif headers_record != payload_record:
|
elif headers_record != payload_record:
|
||||||
# close remainder of stream as this record only used for
|
# close remainder of stream as this record only used for
|
||||||
# (already parsed) headers
|
# (already parsed) headers
|
||||||
headers_record.raw_stream.close()
|
no_except_close(headers_record.raw_stream)
|
||||||
|
|
||||||
# special case: check if headers record is actually empty
|
# special case: check if headers record is actually empty
|
||||||
# (eg empty revisit), then use headers from revisit
|
# (eg empty revisit), then use headers from revisit
|
||||||
@ -37,6 +36,10 @@ class ResolvingLoader(object):
|
|||||||
headers_record = payload_record
|
headers_record = payload_record
|
||||||
|
|
||||||
if not headers_record or not payload_record:
|
if not headers_record or not payload_record:
|
||||||
|
if headers_record:
|
||||||
|
no_except_close(headers_record.raw_stream)
|
||||||
|
if payload_record:
|
||||||
|
no_except_close(payload_record.raw_stream)
|
||||||
raise ArchiveLoadFailed('Could not load ' + str(cdx))
|
raise ArchiveLoadFailed('Could not load ' + str(cdx))
|
||||||
|
|
||||||
# ensure status line is valid from here
|
# ensure status line is valid from here
|
||||||
@ -63,6 +66,7 @@ class ResolvingLoader(object):
|
|||||||
|
|
||||||
# load headers record from cdx['filename'] unless it is '-' (rare)
|
# load headers record from cdx['filename'] unless it is '-' (rare)
|
||||||
headers_record = None
|
headers_record = None
|
||||||
|
payload_record = None
|
||||||
if has_curr:
|
if has_curr:
|
||||||
headers_record = self._resolve_path_load(cdx, False, failed_files)
|
headers_record = self._resolve_path_load(cdx, False, failed_files)
|
||||||
|
|
||||||
@ -85,7 +89,6 @@ class ResolvingLoader(object):
|
|||||||
|
|
||||||
return headers_record, payload_record
|
return headers_record, payload_record
|
||||||
|
|
||||||
|
|
||||||
def _resolve_path_load(self, cdx, is_original, failed_files):
|
def _resolve_path_load(self, cdx, is_original, failed_files):
|
||||||
"""
|
"""
|
||||||
Load specific record based on filename, offset and length
|
Load specific record based on filename, offset and length
|
||||||
|
@ -1,36 +1,31 @@
|
|||||||
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
import datetime
|
||||||
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
import json
|
||||||
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
|
import logging
|
||||||
from warcio.utils import to_native_str
|
import uuid
|
||||||
|
|
||||||
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
|
||||||
|
|
||||||
from pywb.utils.wbexception import LiveResourceException, WbException
|
|
||||||
|
|
||||||
from pywb.utils.canonicalize import canonicalize
|
|
||||||
|
|
||||||
from pywb.utils.memento import MementoUtils
|
|
||||||
from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn
|
|
||||||
from pywb.utils.format import ParamFormatter
|
|
||||||
|
|
||||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
|
||||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
|
||||||
|
|
||||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import uuid
|
|
||||||
import six
|
import six
|
||||||
import itertools
|
|
||||||
import json
|
|
||||||
import glob
|
|
||||||
import datetime
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from requests.models import PreparedRequest
|
from requests.models import PreparedRequest
|
||||||
|
from six.moves.urllib.parse import quote, unquote, urlsplit
|
||||||
|
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
|
||||||
|
from warcio.timeutils import (
|
||||||
|
datetime_to_http_date,
|
||||||
|
datetime_to_iso_date,
|
||||||
|
datetime_to_timestamp,
|
||||||
|
http_date_to_datetime,
|
||||||
|
iso_date_to_datetime,
|
||||||
|
timestamp_to_datetime
|
||||||
|
)
|
||||||
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
from pywb.utils.format import ParamFormatter
|
||||||
|
from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
|
||||||
|
from pywb.utils.memento import MementoUtils
|
||||||
|
from pywb.utils.wbexception import LiveResourceException
|
||||||
|
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||||
|
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||||
|
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||||
|
|
||||||
logger = logging.getLogger('warcserver')
|
logger = logging.getLogger('warcserver')
|
||||||
|
|
||||||
@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
|||||||
http_headers.get_statuscode(),
|
http_headers.get_statuscode(),
|
||||||
http_headers.get_header('Location'))
|
http_headers.get_header('Location'))
|
||||||
except LiveResourceException:
|
except LiveResourceException:
|
||||||
headers.raw_stream.close()
|
no_except_close(headers.raw_stream)
|
||||||
payload.raw_stream.close()
|
no_except_close(payload.raw_stream)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
http_headers_buff = http_headers.to_bytes()
|
http_headers_buff = http_headers.to_bytes()
|
||||||
@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
|
|||||||
|
|
||||||
warc_headers.replace_header('WARC-Date',
|
warc_headers.replace_header('WARC-Date',
|
||||||
headers.rec_headers.get_header('WARC-Date'))
|
headers.rec_headers.get_header('WARC-Date'))
|
||||||
|
no_except_close(headers.raw_stream)
|
||||||
headers.raw_stream.close()
|
|
||||||
|
|
||||||
return (warc_headers, http_headers_buff, payload.raw_stream)
|
return (warc_headers, http_headers_buff, payload.raw_stream)
|
||||||
|
|
||||||
@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
p = PreparedRequest()
|
p = PreparedRequest()
|
||||||
try:
|
try:
|
||||||
p.prepare_url(load_url, None)
|
p.prepare_url(load_url, None)
|
||||||
except:
|
except Exception:
|
||||||
raise LiveResourceException(load_url)
|
raise LiveResourceException(load_url)
|
||||||
p.prepare_headers(None)
|
p.prepare_headers(None)
|
||||||
p.prepare_auth(None, load_url)
|
p.prepare_auth(None, load_url)
|
||||||
@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
elif cdx.get('memento_url'):
|
elif cdx.get('memento_url'):
|
||||||
# if 'memento_url' set and no Memento-Datetime header present
|
# if 'memento_url' set and no Memento-Datetime header present
|
||||||
# then its an error
|
# then its an error
|
||||||
|
no_except_close(upstream_res)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
agg_type = upstream_res.headers.get('Warcserver-Type')
|
agg_type = upstream_res.headers.get('Warcserver-Type')
|
||||||
@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
else:
|
else:
|
||||||
conn = adapter.poolmanager
|
conn = adapter.poolmanager
|
||||||
|
|
||||||
|
upstream_res = None
|
||||||
try:
|
try:
|
||||||
upstream_res = conn.urlopen(method=method,
|
upstream_res = conn.urlopen(method=method,
|
||||||
url=load_url,
|
url=load_url,
|
||||||
@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader):
|
|||||||
return upstream_res
|
return upstream_res
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if upstream_res:
|
||||||
|
no_except_close(upstream_res)
|
||||||
if logger.isEnabledFor(logging.DEBUG):
|
if logger.isEnabledFor(logging.DEBUG):
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
@ -527,7 +525,7 @@ class VideoLoader(BaseLoader):
|
|||||||
self.ydl = None
|
self.ydl = None
|
||||||
return
|
return
|
||||||
|
|
||||||
self.ydl = YoutubeDL(dict(simulate=True,
|
self.ydl = YoutubeDL(dict(simulate=True, quiet=True,
|
||||||
youtube_include_dash_manifest=False))
|
youtube_include_dash_manifest=False))
|
||||||
|
|
||||||
self.ydl.add_default_info_extractors()
|
self.ydl.add_default_info_extractors()
|
||||||
|
@ -9,7 +9,7 @@ brotlipy
|
|||||||
pyyaml
|
pyyaml
|
||||||
werkzeug
|
werkzeug
|
||||||
webencodings
|
webencodings
|
||||||
gevent
|
gevent==1.4.0
|
||||||
webassets==0.12.1
|
webassets==0.12.1
|
||||||
portalocker
|
portalocker
|
||||||
wsgiprox>=1.5.1
|
wsgiprox>=1.5.1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user