diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index cf7bffd6..c22bcbfc 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -1,35 +1,23 @@ +from io import BytesIO + import requests - -from werkzeug.http import HTTP_STATUS_CODES from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit - -from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy - -from pywb.rewrite.wburl import WbUrl -from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter - -from pywb.utils.wbexception import WbException -from pywb.utils.canonicalize import canonicalize -from pywb.utils.loaders import extract_client_cookie -from pywb.utils.io import BUFF_SIZE, OffsetLimitReader -from pywb.utils.memento import MementoUtils - -from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader +from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date +from werkzeug.http import HTTP_STATUS_CODES -from pywb.warcserver.index.cdxobject import CDXObject from pywb.apps.wbrequestresponse import WbResponse - +from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy from pywb.rewrite.rewriteinputreq import RewriteInputRequest -from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView - - -from io import BytesIO -from copy import copy - -import gevent -import json +from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView +from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter +from pywb.rewrite.wburl import WbUrl +from pywb.utils.canonicalize import canonicalize +from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close +from pywb.utils.memento import MementoUtils +from pywb.utils.wbexception import WbException +from pywb.warcserver.index.cdxobject import CDXObject # ============================================================================ @@ -40,7 +28,7 @@ class UpstreamException(WbException): # ============================================================================ -#class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): +# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): # pass @@ -84,8 +72,8 @@ class RewriterApp(object): self.banner_view) self.frame_insert_view = TopFrameView(self.jinja_env, - self._html_templ('frame_insert_html'), - self.banner_view) + self._html_templ('frame_insert_html'), + self.banner_view) self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html')) self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html')) @@ -129,9 +117,9 @@ class RewriterApp(object): if accept_dt: try: wb_url.timestamp = http_date_to_timestamp(accept_dt) - except: + except Exception: raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime') - #return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request') + # return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request') wb_url.type = wb_url.REPLAY @@ -163,7 +151,7 @@ class RewriterApp(object): range_start = start range_end = end - #if start with 0, load from upstream, but add range after + # if start with 0, load from upstream, but add range after if start == 0: del inputreq.env['HTTP_RANGE'] else: @@ -193,11 +181,6 @@ class RewriterApp(object): if range_start >= content_length or range_end >= 
content_length: details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length) - try: - r.raw.close() - except: - pass - raise UpstreamException(416, url=wb_url.url, details=details) range_len = range_end - range_start + 1 @@ -296,9 +279,10 @@ class RewriterApp(object): error = None try: error = r.raw.read() - r.raw.close() - except: + except Exception: pass + finally: + no_except_close(r.raw) if error: error = error.decode('utf-8') @@ -316,10 +300,7 @@ class RewriterApp(object): # add trailing slash new_path = url_parts.path + '/' - try: - r.raw.close() - except: - pass + no_except_close(r.raw) return self.send_redirect(new_path, url_parts, urlrewriter) @@ -330,9 +311,9 @@ class RewriterApp(object): memento_dt = r.headers.get('Memento-Datetime') target_uri = r.headers.get('WARC-Target-URI') - #cdx['urlkey'] = urlkey - #cdx['timestamp'] = http_date_to_timestamp(memento_dt) - #cdx['url'] = target_uri + # cdx['urlkey'] = urlkey + # cdx['timestamp'] = http_date_to_timestamp(memento_dt) + # cdx['url'] = target_uri set_content_loc = False @@ -343,7 +324,7 @@ class RewriterApp(object): # if redir to exact, redir if url or ts are different if self.redirect_to_exact: if (set_content_loc or - (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))): + (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))): new_url = urlrewriter.get_new_url(url=target_uri, timestamp=cdx['timestamp'], @@ -375,15 +356,15 @@ class RewriterApp(object): else: top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view. - create_insert_func(wb_url, - full_prefix, - host_prefix, - top_url, - environ, - framed_replay, - coll=kwargs.get('coll', ''), - replay_mod=self.replay_mod, - config=self.config)) + create_insert_func(wb_url, + full_prefix, + host_prefix, + top_url, + environ, + framed_replay, + coll=kwargs.get('coll', ''), + replay_mod=self.replay_mod, + config=self.config)) cookie_rewriter = None if self.cookie_tracker: @@ -511,7 +492,6 @@ class RewriterApp(object): return WbResponse.text_response(resp, status=status, content_type='text/html') - def _do_req(self, inputreq, wb_url, kwargs, skip_record): req_data = inputreq.reconstruct_request(wb_url.url) @@ -618,7 +598,7 @@ class RewriterApp(object): return scheme + host def get_rel_prefix(self, environ): - #return request.script_name + # return request.script_name return environ.get('SCRIPT_NAME') + '/' def get_full_prefix(self, environ): diff --git a/pywb/apps/wbrequestresponse.py b/pywb/apps/wbrequestresponse.py index db9993b7..b0604582 100644 --- a/pywb/apps/wbrequestresponse.py +++ b/pywb/apps/wbrequestresponse.py @@ -1,5 +1,7 @@ from warcio.statusandheaders import StatusAndHeaders +from pywb.utils.io import no_except_close + try: import ujson as json except ImportError: # pragma: no cover @@ -151,8 +153,7 @@ class WbResponse(object): self.status_headers.headers) request_method = env['REQUEST_METHOD'] if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'): - if hasattr(self.body, 'close'): - self.body.close() + no_except_close(self.body) return [] return self.body diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index dbfce96a..ba7d3977 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -2,15 +2,14 @@ import base64 import datetime import os import shutil - import traceback import portalocker - from warcio.timeutils 
import timestamp20_now from warcio.warcwriter import BaseWARCWriter from pywb.utils.format import res_template +from pywb.utils.io import no_except_close # ============================================================================ @@ -85,7 +84,7 @@ class MultiFileWARCWriter(BaseWARCWriter): try: os.makedirs(path) - except: + except Exception: pass fh = open(filename, 'a+b') @@ -99,11 +98,12 @@ class MultiFileWARCWriter(BaseWARCWriter): try: if os.name != 'nt': portalocker.lock(fh, portalocker.LOCK_UN) - fh.close() return True except Exception as e: print(e) return False + finally: + no_except_close(fh) def get_dir_key(self, params): return res_template(self.key_template, params) @@ -249,7 +249,7 @@ class MultiFileWARCWriter(BaseWARCWriter): for dir_key, out, filename in self.iter_open_files(): try: mtime = os.path.getmtime(filename) - except: + except Exception: self.close_key(dir_key) return diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 5fb35eac..689d4171 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -1,26 +1,21 @@ -from pywb.utils.io import StreamIter, BUFF_SIZE -from pywb.utils.format import ParamFormatter, res_template -from pywb.warcserver.inputrequest import DirectWSGIInputRequest - -from warcio.recordloader import ArcWarcRecordLoader - -from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter - -from six.moves.urllib.parse import parse_qsl -import six - import json import tempfile - -import requests - import traceback -import gevent.queue import gevent +import gevent.queue +import requests +import six +from six.moves.urllib.parse import parse_qsl +from warcio.recordloader import ArcWarcRecordLoader + +from pywb.recorder.filters import CollectionFilter, SkipRangeRequestFilter +from pywb.utils.format import ParamFormatter +from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close +from pywb.warcserver.inputrequest import DirectWSGIInputRequest -#============================================================================== +# ============================================================================== class RecorderApp(object): def __init__(self, upstream_host, writer, skip_filters=None, **kwargs): self.upstream_host = upstream_host @@ -52,13 +47,13 @@ class RecorderApp(object): @staticmethod def default_create_buffer(params, name): - return tempfile.SpooledTemporaryFile(max_size=512*1024) + return tempfile.SpooledTemporaryFile(max_size=512 * 1024) def _write_loop(self): while True: try: self._write_one() - except: + except Exception: traceback.print_exc() def _write_one(self): @@ -88,14 +83,13 @@ class RecorderApp(object): else: self.writer.write_record(resp, params) - finally: try: if req_pay: - req_pay.close() + no_except_close(req_pay) if resp_pay: - resp_pay.close() + no_except_close(resp_pay) except Exception as e: traceback.print_exc() @@ -155,7 +149,7 @@ class RecorderApp(object): finally: if req_stream: - req_stream.out.close() + no_except_close(req_stream.out) return self.send_message(msg, '200 OK', @@ -169,8 +163,7 @@ class RecorderApp(object): def __call__(self, environ, start_response): try: return self.handle_call(environ, start_response) - except: - import traceback + except Exception: traceback.print_exc() def handle_call(self, environ, start_response): @@ -217,15 +210,15 @@ class RecorderApp(object): try: res = requests.request(url=self.upstream_host + request_uri, - method=method, - data=data, - headers=headers, - allow_redirects=False, - stream=True) + method=method, + 
data=data, + headers=headers, + allow_redirects=False, + stream=True) res.raise_for_status() except Exception as e: if req_is_wrapped: - req_stream.out.close() + no_except_close(req_stream.out) return self.send_error(e, start_response) if not skipping: @@ -233,8 +226,7 @@ class RecorderApp(object): req_stream.headers, res.headers, params) - for x in self.skip_filters) - + for x in self.skip_filters) if not skipping: resp_stream = RespWrapper(res.raw, @@ -248,7 +240,7 @@ class RecorderApp(object): else: resp_stream = res.raw if req_is_wrapped: - req_stream.out.close() + no_except_close(req_stream.out) resp_iter = StreamIter(resp_stream) @@ -260,7 +252,7 @@ class RecorderApp(object): return resp_iter -#============================================================================== +# ============================================================================== class Wrapper(object): def __init__(self, stream, params, create_func): self.stream = stream @@ -280,7 +272,7 @@ class Wrapper(object): return buff -#============================================================================== +# ============================================================================== class RespWrapper(Wrapper): def __init__(self, stream, headers, req, params, queue, path, create_func): @@ -319,23 +311,20 @@ class RespWrapper(Wrapper): entry = (self.req.headers, self.req.out, self.headers, self.out, self.params) self.queue.put(entry) - except: + except Exception: traceback.print_exc() skipping = True finally: - try: - if skipping: - self.out.close() - self.req.out.close() - except: - traceback.print_exc() + if skipping: + no_except_close(self.out) + no_except_close(self.req.out) - self.req.close() + no_except_close(self.req) self.req = None -#============================================================================== +# ============================================================================== class ReqWrapper(Wrapper): def __init__(self, stream, req_headers, params, create_func): super(ReqWrapper, self).__init__(stream, params, create_func) @@ -348,5 +337,3 @@ class ReqWrapper(Wrapper): def close(self): # no need to close wsgi.input pass - - diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 34cf5be4..34e0e225 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -1,19 +1,15 @@ -from io import BytesIO - +import codecs +import json +import re +import tempfile from contextlib import closing +import webencodings from warcio.bufferedreaders import BufferedReader, ChunkedDataReader from warcio.utils import to_native_str -import re -import webencodings -import tempfile -import json -import codecs - -from pywb.utils.io import StreamIter, BUFF_SIZE - -from pywb.utils.loaders import load_yaml_config, load_py_name +from pywb.utils.io import BUFF_SIZE, StreamIter, no_except_close +from pywb.utils.loaders import load_py_name, load_yaml_config WORKER_MODS = {"wkr_", "sw_"} # type: Set[str] @@ -344,7 +340,7 @@ class StreamingRewriter(object): yield buff.encode(charset) finally: - stream.close() + no_except_close(stream) # ============================================================================ diff --git a/pywb/utils/geventserver.py b/pywb/utils/geventserver.py index a42519be..1db2cf96 100644 --- a/pywb/utils/geventserver.py +++ b/pywb/utils/geventserver.py @@ -1,6 +1,8 @@ -from gevent.pywsgi import WSGIServer, WSGIHandler -from gevent import spawn import logging +import traceback + +from gevent import spawn +from gevent.pywsgi import 
WSGIHandler, WSGIServer # ============================================================================ diff --git a/pywb/utils/io.py b/pywb/utils/io.py index a651a75e..c5de3ede 100644 --- a/pywb/utils/io.py +++ b/pywb/utils/io.py @@ -1,12 +1,35 @@ import zlib from contextlib import closing, contextmanager - -from warcio.utils import BUFF_SIZE -from warcio.limitreader import LimitReader from tempfile import SpooledTemporaryFile +from warcio.limitreader import LimitReader +from warcio.utils import BUFF_SIZE -#============================================================================= + +def no_except_close(closable): + """Attempts to call the close method of the + supplied object. + + :param closable: The object to be closed + :rtype: None + """ + if not closable: + return + + try: + closable.close() + except Exception: + pass + + try: + release_conn = getattr(closable, 'release_conn', None) + if release_conn is not None: + release_conn() + except Exception: + pass + + +# ============================================================================= def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closing): with closer(stream): if header1: @@ -22,19 +45,16 @@ def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE, closer=closin yield buff -#============================================================================= +# ============================================================================= @contextmanager def call_release_conn(stream): try: yield stream finally: - if hasattr(stream, 'release_conn'): - stream.release_conn() - else: - stream.close() + no_except_close(stream) -#============================================================================= +# ============================================================================= def chunk_encode_iter(orig_iter): for chunk in orig_iter: if not len(chunk): @@ -47,7 +67,7 @@ def chunk_encode_iter(orig_iter): yield b'0\r\n\r\n' -#============================================================================= +# ============================================================================= def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4): out = SpooledTemporaryFile(buff_size) size = 0 @@ -65,7 +85,7 @@ def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4): return StreamIter(out) -#============================================================================= +# ============================================================================= def compress_gzip_iter(orig_iter): compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16) for chunk in orig_iter: @@ -101,4 +121,3 @@ class OffsetLimitReader(LimitReader): def readline(self, length=None): self._skip() return super(OffsetLimitReader, self).readline(length) - diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index d03582c9..e2426dc8 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -11,22 +11,22 @@ import requests import yaml import six -from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode +from six.moves.urllib.parse import unquote_plus, urlsplit import time import pkgutil -import base64 -import cgi from io import open, BytesIO from warcio.limitreader import LimitReader +from pywb.utils.io import no_except_close try: import boto3 from botocore import UNSIGNED from botocore.client import Config + s3_avail = True -except ImportError: #pragma: no cover +except ImportError: # pragma: no cover s3_avail = False @@ -39,12 +39,12 @@ def load_py_name(string): return 
getattr(mod, string[1]) -#================================================================= +# ================================================================= def is_http(filename): return filename.startswith(('http://', 'https://')) -#================================================================= +# ================================================================= def to_file_url(filename): """ Convert a filename to a file:// url """ @@ -52,7 +52,7 @@ def to_file_url(filename): return url -#================================================================= +# ================================================================= def from_file_url(url): """ Convert from file:// url to file path """ @@ -62,7 +62,7 @@ def from_file_url(url): return url -#================================================================= +# ================================================================= def load(filename): return BlockLoader().load(filename) @@ -75,8 +75,7 @@ def load_yaml_config(config_file): configdata = load(config_file) config = yaml.load(configdata) finally: - if configdata: - configdata.close() + no_except_close(configdata) return config @@ -84,7 +83,6 @@ def load_yaml_config(config_file): # ============================================================================= def load_overlay_config(main_env_var, main_default_file='', overlay_env_var='', overlay_file=''): - configfile = os.environ.get(main_env_var, main_default_file) config = None @@ -104,7 +102,7 @@ def load_overlay_config(main_env_var, main_default_file='', return config -#================================================================= +# ================================================================= def extract_client_cookie(env, cookie_name): cookie_header = env.get('HTTP_COOKIE') if not cookie_header: @@ -129,7 +127,7 @@ def extract_client_cookie(env, cookie_name): return value -#================================================================= +# ================================================================= def read_last_line(fh, offset=256): """ Read last line from a seekable file. 
Start reading from buff before end of file, and double backwards seek @@ -150,7 +148,7 @@ def read_last_line(fh, offset=256): return fh.readlines()[-1] -#================================================================= +# ================================================================= class BaseLoader(object): def __init__(self, **kwargs): pass @@ -159,7 +157,7 @@ class BaseLoader(object): raise NotImplemented() -#================================================================= +# ================================================================= class BlockLoader(BaseLoader): """ a loader which can stream blocks of content @@ -171,6 +169,7 @@ class BlockLoader(BaseLoader): profile_loader = None def __init__(self, **kwargs): + super(BlockLoader, self).__init__() self.cached = {} self.kwargs = kwargs @@ -241,7 +240,7 @@ class BlockLoader(BaseLoader): return range_header -#================================================================= +# ================================================================= class PackageLoader(BaseLoader): def load(self, url, offset=0, length=-1): if url.startswith('pkg://'): @@ -263,11 +262,11 @@ class PackageLoader(BaseLoader): buff.name = url return buff - #afile = pkg_resources.resource_stream(pkg_split[0], + # afile = pkg_resources.resource_stream(pkg_split[0], # pkg_split[1]) -#================================================================= +# ================================================================= class LocalFileLoader(PackageLoader): def load(self, url, offset=0, length=-1): """ @@ -283,11 +282,13 @@ class LocalFileLoader(PackageLoader): file_only = True url = filename + afile = None try: # first, try as file afile = open(url, 'rb') except IOError: + no_except_close(afile) if file_only: raise @@ -302,9 +303,10 @@ class LocalFileLoader(PackageLoader): return afile -#================================================================= +# ================================================================= class HttpLoader(BaseLoader): def __init__(self, **kwargs): + super(HttpLoader, self).__init__() self.cookie_maker = kwargs.get('cookie_maker') if not self.cookie_maker: self.cookie_maker = kwargs.get('cookie') @@ -333,16 +335,17 @@ class HttpLoader(BaseLoader): return r.raw -#================================================================= +# ================================================================= class S3Loader(BaseLoader): def __init__(self, **kwargs): + super(S3Loader, self).__init__() self.client = None self.aws_access_key_id = kwargs.get('aws_access_key_id') self.aws_secret_access_key = kwargs.get('aws_secret_access_key') def load(self, url, offset, length): - if not s3_avail: #pragma: no cover - raise IOError('To load from s3 paths, ' + + if not s3_avail: # pragma: no cover + raise IOError('To load from s3 paths, ' + 'you must install boto3: pip install boto3') aws_access_key_id = self.aws_access_key_id @@ -372,8 +375,8 @@ class S3Loader(BaseLoader): config = None client = boto3.client('s3', aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - config=config) + aws_secret_access_key=aws_secret_access_key, + config=config) else: client = self.client @@ -398,15 +401,16 @@ class S3Loader(BaseLoader): return obj['Body'] -#================================================================= +# ================================================================= # Signed Cookie-Maker -#================================================================= +# 
================================================================= class HMACCookieMaker(object): """ Utility class to produce signed HMAC digest cookies to be used with each http request """ + def __init__(self, key, name, duration=10): self.key = key self.name = name @@ -435,4 +439,3 @@ class HMACCookieMaker(object): # ============================================================================ BlockLoader.init_default_loaders() - diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 80baa77f..09101e18 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -1,22 +1,18 @@ -from pywb.utils.binsearch import iter_range -from pywb.utils.canonicalize import canonicalize -from pywb.utils.wbexception import NotFoundException - -from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp -from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN - -from pywb.warcserver.http import DefaultAdapters -from pywb.warcserver.index.cdxobject import CDXObject - -from pywb.utils.format import ParamFormatter, res_template -from pywb.utils.memento import MementoUtils +import logging +import re import redis - import requests +from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date -import re -import logging +from pywb.utils.binsearch import iter_range +from pywb.utils.canonicalize import canonicalize +from pywb.utils.format import res_template +from pywb.utils.io import no_except_close +from pywb.utils.memento import MementoUtils +from pywb.utils.wbexception import NotFoundException +from pywb.warcserver.http import DefaultAdapters +from pywb.warcserver.index.cdxobject import CDXObject #============================================================================= @@ -432,15 +428,16 @@ class MementoIndexSource(BaseIndexSource): def handle_timemap(self, params): url = res_template(self.timemap_url, params) headers = self._get_headers(params) + res = None try: res = self.sesh.get(url, headers=headers, timeout=params.get('_timeout')) res.raise_for_status() - assert(res.text) except Exception as e: + no_except_close(res) self.logger.debug('FAILED: ' + str(e)) raise NotFoundException(url) @@ -550,14 +547,17 @@ class WBMementoIndexSource(MementoIndexSource): url = params['url'] load_url = self.timegate_url.format(url=url, timestamp=timestamp) + res = None try: headers = self._get_headers(params) res = self.sesh.head(load_url, headers=headers) except Exception as e: + no_except_close(res) raise NotFoundException(url) if res and res.headers.get('Memento-Datetime'): if res.status_code >= 400: + no_except_close(res) raise NotFoundException(url) if res.status_code >= 300: diff --git a/pywb/warcserver/index/zipnum.py b/pywb/warcserver/index/zipnum.py index 63a348ae..47f2e46d 100644 --- a/pywb/warcserver/index/zipnum.py +++ b/pywb/warcserver/index/zipnum.py @@ -1,24 +1,20 @@ +import datetime +import itertools +import json +import logging +import os from io import BytesIO -import os -import collections -import itertools -import logging -import datetime -import json import six - -from six.moves import map - from warcio.bufferedreaders import gzip_decompressor -#from pywb.warcserver.index.cdxsource import CDXSource -from pywb.warcserver.index.indexsource import BaseIndexSource -from pywb.warcserver.index.cdxobject import IDXObject, CDXException, CDXObject -from pywb.warcserver.index.query import CDXQuery - -from pywb.utils.loaders import BlockLoader, 
read_last_line from pywb.utils.binsearch import iter_range, linearsearch, search +from pywb.utils.io import no_except_close +from pywb.utils.loaders import BlockLoader, read_last_line +from pywb.warcserver.index.cdxobject import CDXException, CDXObject, IDXObject +# from pywb.warcserver.index.cdxsource import CDXSource +from pywb.warcserver.index.indexsource import BaseIndexSource +from pywb.warcserver.index.query import CDXQuery # ============================================================================ @@ -211,7 +207,7 @@ class ZipNumIndexSource(BaseIndexSource): if end_line == last_line and query.key >= last_line: first_line = last_line else: - reader.close() + no_except_close(reader) if query.page_count: yield self._page_info(0, pagesize, 0) return @@ -240,13 +236,13 @@ class ZipNumIndexSource(BaseIndexSource): blocks = -1 yield self._page_info(total_pages, pagesize, blocks + 1) - reader.close() + no_except_close(reader) return curr_page = query.page if curr_page >= total_pages or curr_page < 0: msg = 'Page {0} invalid: First Page is 0, Last Page is {1}' - reader.close() + no_except_close(reader) raise CDXException(msg.format(curr_page, total_pages - 1)) startline = curr_page * pagesize @@ -259,12 +255,14 @@ class ZipNumIndexSource(BaseIndexSource): else: startline -= 1 - idxiter = itertools.islice(first_iter, startline, endline) - for idx in idxiter: - yield idx - - reader.close() - + try: + idxiter = itertools.islice(first_iter, startline, endline) + for idx in idxiter: + yield idx + except Exception: + pass + finally: + no_except_close(reader) def search_by_line_num(self, reader, line): # pragma: no cover def line_cmp(line1, line2): @@ -349,7 +347,7 @@ class ZipNumIndexSource(BaseIndexSource): for r in ranges: yield decompress_block(r) finally: - reader.close() + no_except_close(reader) # iterate over all blocks iter_ = itertools.chain.from_iterable(iter_blocks(reader)) diff --git a/pywb/warcserver/resource/resolvingloader.py b/pywb/warcserver/resource/resolvingloader.py index a8fc455f..aa584f5e 100644 --- a/pywb/warcserver/resource/resolvingloader.py +++ b/pywb/warcserver/resource/resolvingloader.py @@ -1,20 +1,19 @@ +import six from warcio.recordloader import ArchiveLoadFailed from warcio.timeutils import iso_date_to_timestamp +from pywb.utils.io import no_except_close +from pywb.utils.wbexception import NotFoundException from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader -from pywb.utils.wbexception import NotFoundException -import six - - -#================================================================= +# ================================================================= class ResolvingLoader(object): MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded' - def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False): + def __init__(self, path_resolvers, record_loader=None, no_record_parse=False): self.path_resolvers = path_resolvers - self.record_loader = record_loader + self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader() self.no_record_parse = no_record_parse def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs): @@ -29,7 +28,7 @@ class ResolvingLoader(object): elif headers_record != payload_record: # close remainder of stream as this record only used for # (already parsed) headers - headers_record.raw_stream.close() + no_except_close(headers_record.raw_stream) # special case: check if headers record is actually empty # (eg empty 
revisit), then use headers from revisit @@ -37,6 +36,10 @@ class ResolvingLoader(object): headers_record = payload_record if not headers_record or not payload_record: + if headers_record: + no_except_close(headers_record.raw_stream) + if payload_record: + no_except_close(payload_record.raw_stream) raise ArchiveLoadFailed('Could not load ' + str(cdx)) # ensure status line is valid from here @@ -57,12 +60,13 @@ class ResolvingLoader(object): from a different url to find the original record. """ has_curr = (cdx['filename'] != '-') - #has_orig = (cdx.get('orig.filename', '-') != '-') + # has_orig = (cdx.get('orig.filename', '-') != '-') orig_f = cdx.get('orig.filename') has_orig = orig_f and orig_f != '-' # load headers record from cdx['filename'] unless it is '-' (rare) headers_record = None + payload_record = None if has_curr: headers_record = self._resolve_path_load(cdx, False, failed_files) @@ -85,7 +89,6 @@ class ResolvingLoader(object): return headers_record, payload_record - def _resolve_path_load(self, cdx, is_original, failed_files): """ Load specific record based on filename, offset and length @@ -127,8 +130,8 @@ class ResolvingLoader(object): any_found = True try: return (self.record_loader. - load(path, offset, length, - no_record_parse=self.no_record_parse)) + load(path, offset, length, + no_record_parse=self.no_record_parse)) except Exception as ue: last_exc = ue @@ -140,12 +143,12 @@ class ResolvingLoader(object): failed_files.append(filename) if last_exc: - #msg = str(last_exc.__class__.__name__) + # msg = str(last_exc.__class__.__name__) msg = str(last_exc) else: msg = 'Archive File Not Found' - #raise ArchiveLoadFailed(msg, filename), None, last_traceback + # raise ArchiveLoadFailed(msg, filename), None, last_traceback six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback) def _load_different_url_payload(self, cdx, headers_record, diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 2396da1e..830c9fbe 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -1,36 +1,31 @@ -from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp -from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date -from warcio.timeutils import http_date_to_datetime, datetime_to_http_date -from warcio.utils import to_native_str - -from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser - -from pywb.utils.wbexception import LiveResourceException, WbException - -from pywb.utils.canonicalize import canonicalize - -from pywb.utils.memento import MementoUtils -from pywb.utils.io import StreamIter, compress_gzip_iter, call_release_conn -from pywb.utils.format import ParamFormatter - -from pywb.warcserver.resource.resolvingloader import ResolvingLoader -from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin - -from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES - -from six.moves.urllib.parse import urlsplit, quote, unquote - +import datetime +import json +import logging +import uuid from io import BytesIO -import uuid import six -import itertools -import json -import glob -import datetime -import logging - from requests.models import PreparedRequest +from six.moves.urllib.parse import quote, unquote, urlsplit +from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser +from warcio.timeutils import ( + datetime_to_http_date, + datetime_to_iso_date, + datetime_to_timestamp, + 
http_date_to_datetime, + iso_date_to_datetime, + timestamp_to_datetime +) +from warcio.utils import to_native_str + +from pywb.utils.canonicalize import canonicalize +from pywb.utils.format import ParamFormatter +from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close +from pywb.utils.memento import MementoUtils +from pywb.utils.wbexception import LiveResourceException +from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES +from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin +from pywb.warcserver.resource.resolvingloader import ResolvingLoader logger = logging.getLogger('warcserver') @@ -217,8 +212,8 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): http_headers.get_statuscode(), http_headers.get_header('Location')) except LiveResourceException: - headers.raw_stream.close() - payload.raw_stream.close() + no_except_close(headers.raw_stream) + no_except_close(payload.raw_stream) raise http_headers_buff = http_headers.to_bytes() @@ -237,8 +232,7 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): warc_headers.replace_header('WARC-Date', headers.rec_headers.get_header('WARC-Date')) - - headers.raw_stream.close() + no_except_close(headers.raw_stream) return (warc_headers, http_headers_buff, payload.raw_stream) @@ -288,7 +282,7 @@ class LiveWebLoader(BaseLoader): p = PreparedRequest() try: p.prepare_url(load_url, None) - except: + except Exception: raise LiveResourceException(load_url) p.prepare_headers(None) p.prepare_auth(None, load_url) @@ -320,6 +314,7 @@ class LiveWebLoader(BaseLoader): elif cdx.get('memento_url'): # if 'memento_url' set and no Memento-Datetime header present # then its an error + no_except_close(upstream_res) return None agg_type = upstream_res.headers.get('Warcserver-Type') @@ -485,6 +480,7 @@ class LiveWebLoader(BaseLoader): else: conn = adapter.poolmanager + upstream_res = None try: upstream_res = conn.urlopen(method=method, url=load_url, @@ -500,6 +496,8 @@ class LiveWebLoader(BaseLoader): return upstream_res except Exception as e: + if upstream_res: + no_except_close(upstream_res) if logger.isEnabledFor(logging.DEBUG): import traceback traceback.print_exc() @@ -527,7 +525,7 @@ class VideoLoader(BaseLoader): self.ydl = None return - self.ydl = YoutubeDL(dict(simulate=True, + self.ydl = YoutubeDL(dict(simulate=True, quiet=True, youtube_include_dash_manifest=False)) self.ydl.add_default_info_extractors() diff --git a/requirements.txt b/requirements.txt index 997de6bc..ec32afda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ brotlipy pyyaml werkzeug webencodings -gevent +gevent==1.4.0 webassets==0.12.1 portalocker wsgiprox>=1.5.1
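
Note on the recurring change in this diff: the scattered "try: x.close() except: pass" blocks are replaced by a single helper, no_except_close(), added to pywb/utils/io.py and imported at each call site. The sketch below is illustrative only — it reproduces the helper as it appears in the diff and shows the call-site shape used in rewriterapp.py; the requests URL and the read size are hypothetical stand-ins, not taken from the patch.

# Minimal sketch of the no_except_close() pattern introduced in pywb/utils/io.py.
# The helper body matches the diff; the usage below (URL, read size) is invented
# purely for illustration.
import requests


def no_except_close(closable):
    """Attempt to close the object, swallowing any error, then release the
    underlying connection if the object exposes release_conn() (as urllib3
    response objects do)."""
    if not closable:
        return

    try:
        closable.close()
    except Exception:
        pass

    try:
        release_conn = getattr(closable, 'release_conn', None)
        if release_conn is not None:
            release_conn()
    except Exception:
        pass


res = requests.get('https://example.com/', stream=True)
try:
    data = res.raw.read(1024)
finally:
    # replaces the repeated "try: r.raw.close() except: pass" blocks
    no_except_close(res.raw)

Because the helper also calls release_conn(), it subsumes the old call_release_conn() branching ("release_conn if available, else close"), which is why that context manager now simply delegates to no_except_close().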
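A second pattern worth calling out is the change to ResolvingLoader.__init__ in resolvingloader.py: the default record_loader=BlockArcWarcRecordLoader() (a single instance created once at function-definition time and shared by every caller) becomes a None sentinel resolved per instance. The sketch below shows only the idiom, with a simplified stand-in class rather than the real BlockArcWarcRecordLoader.

# Sketch of the sentinel-default idiom from the ResolvingLoader change.
# RecordLoader is a hypothetical stand-in for BlockArcWarcRecordLoader.
class RecordLoader(object):
    pass


class ResolvingLoader(object):
    def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
        self.path_resolvers = path_resolvers
        # build the default lazily, per instance, instead of once at
        # definition time where it would be shared across all instances
        self.record_loader = record_loader if record_loader is not None else RecordLoader()
        self.no_record_parse = no_record_parse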