diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 5987aa12..d2edf55c 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -7,7 +7,9 @@ from werkzeug.wsgi import pop_path_info from six.moves.urllib.parse import urljoin from six import iteritems -from pywb.utils.loaders import load_yaml_config, to_native_str +from warcio.utils import to_native_str + +from pywb.utils.loaders import load_yaml_config from pywb.utils.geventserver import GeventServer from pywb.warcserver.warcserver import WarcServer diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 132e93cd..635c8838 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -14,6 +14,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter from pywb.utils.wbexception import WbException from pywb.utils.canonicalize import canonicalize from pywb.utils.loaders import extract_client_cookie +from pywb.utils.io import BUFF_SIZE +from pywb.utils.memento import MementoUtils from warcio.timeutils import http_date_to_timestamp from warcio.bufferedreaders import BufferedReader @@ -22,9 +24,6 @@ from warcio.recordloader import ArcWarcRecordLoader from pywb.warcserver.index.cdxobject import CDXObject from pywb.apps.wbrequestresponse import WbResponse -from pywb.warcserver.utils import BUFF_SIZE -from pywb.warcserver.utils import MementoUtils - from pywb.rewrite.rewriteinputreq import RewriteInputRequest from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView diff --git a/pywb/indexer/archiveindexer.py b/pywb/indexer/archiveindexer.py index 2a651525..1b5f1760 100644 --- a/pywb/indexer/archiveindexer.py +++ b/pywb/indexer/archiveindexer.py @@ -1,7 +1,7 @@ from pywb.utils.canonicalize import canonicalize from pywb.warcserver.inputrequest import PostQueryExtractor -from pywb.warcserver.utils import BUFF_SIZE +from pywb.utils.io import BUFF_SIZE from warcio.timeutils import iso_date_to_timestamp from warcio.archiveiterator import ArchiveIterator diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index f0518de3..41e9a363 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -10,7 +10,7 @@ import portalocker from warcio.timeutils import timestamp20_now from warcio.warcwriter import BaseWARCWriter -from pywb.warcserver.utils import res_template +from pywb.utils.format import res_template # ============================================================================ diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 8e164b24..5aa326af 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -1,5 +1,5 @@ -from pywb.warcserver.utils import StreamIter, BUFF_SIZE -from pywb.warcserver.utils import ParamFormatter, res_template +from pywb.utils.io import StreamIter, BUFF_SIZE +from pywb.utils.format import ParamFormatter, res_template from pywb.warcserver.inputrequest import DirectWSGIInputRequest from warcio.recordloader import ArcWarcRecordLoader diff --git a/pywb/recorder/redisindexer.py b/pywb/recorder/redisindexer.py index 55433464..af244260 100644 --- a/pywb/recorder/redisindexer.py +++ b/pywb/recorder/redisindexer.py @@ -4,12 +4,13 @@ from io import BytesIO import os from pywb.utils.canonicalize import calc_search_range +from pywb.utils.format import res_template + from pywb.indexer.cdxindexer import write_cdx_index from pywb.warcserver.index.cdxobject import CDXObject from pywb.warcserver.index.indexsource import RedisIndexSource from pywb.warcserver.index.aggregator import SimpleAggregator -from pywb.warcserver.utils import res_template from pywb.recorder.filters import WriteRevisitDupePolicy diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index a9526228..5c9deace 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -16,7 +16,7 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy -from pywb.warcserver.utils import MementoUtils +from pywb.utils.memento import MementoUtils from pywb.warcserver.index.cdxobject import CDXObject diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index efd915bc..43610852 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -9,7 +9,7 @@ import re import webencodings import tempfile -from pywb.warcserver.utils import StreamIter, BUFF_SIZE +from pywb.utils.io import StreamIter, BUFF_SIZE from pywb.utils.loaders import load_yaml_config diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 962d73c6..c8d1e5e1 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -245,7 +245,7 @@ Exception: ('Invalid WbUrl: ', '') from pywb.rewrite.wburl import WbUrl from six.moves.urllib.parse import quote_plus, unquote_plus -from pywb.utils.loaders import to_native_str +from warcio.utils import to_native_str from io import StringIO diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 1853ed01..a606f20a 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -44,7 +44,7 @@ import six from six.moves.urllib.parse import urlsplit, urlunsplit from six.moves.urllib.parse import quote_plus, quote, unquote_plus -from pywb.utils.loaders import to_native_str +from warcio.utils import to_native_str #================================================================= diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 13b58397..25bdf3a2 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -6,6 +6,7 @@ local and remote access import os import hmac import requests +import yaml import six from six.moves.urllib.request import pathname2url, url2pathname @@ -18,7 +19,6 @@ import cgi from io import open, BytesIO from warcio.limitreader import LimitReader -from warcio.utils import to_native_str try: from boto import connect_s3 @@ -46,9 +46,8 @@ def load(filename): return BlockLoader().load(filename) -#================================================================= +# ============================================================================= def load_yaml_config(config_file): - import yaml config = None configdata = None try: @@ -61,6 +60,29 @@ def load_yaml_config(config_file): return config +# ============================================================================= +def load_overlay_config(main_env_var, main_default_file='', + overlay_env_var='', overlay_file=''): + + configfile = os.environ.get(main_env_var, main_default_file) + config = None + + if configfile: + configfile = os.path.expandvars(configfile) + + config = load_yaml_config(configfile) + + config = config or {} + + overlay_configfile = os.environ.get(overlay_env_var, overlay_file) + + if overlay_configfile: + overlay_configfile = os.path.expandvars(overlay_configfile) + config.update(load_yaml_config(overlay_configfile)) + + return config + + #================================================================= def extract_client_cookie(env, cookie_name): cookie_header = env.get('HTTP_COOKIE') diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 5b490fa2..785de72d 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -1,11 +1,11 @@ from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import NotFoundException +from pywb.utils.memento import MementoUtils from warcio.recordloader import ArchiveLoadFailed from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader -from pywb.warcserver.utils import MementoUtils import six diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 90f396cf..db5cf435 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -12,7 +12,7 @@ from collections import deque from itertools import chain from pywb.utils.wbexception import NotFoundException, WbException -from pywb.warcserver.utils import ParamFormatter, res_template +from pywb.utils.format import ParamFormatter, res_template from pywb.warcserver.index.indexsource import FileIndexSource, RedisIndexSource from pywb.warcserver.index.cdxops import process_cdx diff --git a/pywb/warcserver/index/cdxobject.py b/pywb/warcserver/index/cdxobject.py index 3e8dddc5..b470e87b 100644 --- a/pywb/warcserver/index/cdxobject.py +++ b/pywb/warcserver/index/cdxobject.py @@ -10,7 +10,7 @@ from six.moves.urllib.parse import urlencode, quote from six.moves.urllib.parse import parse_qs from pywb.utils.wbexception import WbException -from pywb.utils.loaders import to_native_str +from warcio.utils import to_native_str from json import loads as json_decode from json import dumps as json_encode diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 64a30400..e557ed4b 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -7,8 +7,8 @@ from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN from pywb.warcserver.index.cdxobject import CDXObject -from pywb.warcserver.utils import ParamFormatter, res_template -from pywb.warcserver.utils import MementoUtils +from pywb.utils.format import ParamFormatter, res_template +from pywb.utils.memento import MementoUtils import redis diff --git a/pywb/warcserver/resource/blockrecordloader.py b/pywb/warcserver/resource/blockrecordloader.py index 91d9ebff..a266318c 100644 --- a/pywb/warcserver/resource/blockrecordloader.py +++ b/pywb/warcserver/resource/blockrecordloader.py @@ -2,7 +2,7 @@ from warcio.bufferedreaders import DecompressingBufferedReader from warcio.recordloader import ArcWarcRecordLoader from pywb.utils.loaders import BlockLoader -from pywb.warcserver.utils import BUFF_SIZE +from pywb.utils.io import BUFF_SIZE #================================================================= diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py index ff815348..8c5767e1 100644 --- a/pywb/warcserver/resource/pathresolvers.py +++ b/pywb/warcserver/resource/pathresolvers.py @@ -1,7 +1,7 @@ import redis +from warcio.utils import to_native_str from pywb.utils.binsearch import iter_exact -from pywb.utils.loaders import to_native_str from pywb.warcserver.index.indexsource import RedisIndexSource diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index c4ff4fdf..c7e57ca8 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -7,8 +7,9 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser from pywb.utils.wbexception import LiveResourceException, WbException -from pywb.warcserver.utils import MementoUtils, StreamIter, compress_gzip_iter -from pywb.warcserver.utils import ParamFormatter +from pywb.utils.memento import MementoUtils +from pywb.utils.io import StreamIter, compress_gzip_iter +from pywb.utils.format import ParamFormatter from pywb.warcserver.resource.resolvingloader import ResolvingLoader from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin diff --git a/pywb/warcserver/test/test_handlers.py b/pywb/warcserver/test/test_handlers.py index 3c9e2a05..7057c77f 100644 --- a/pywb/warcserver/test/test_handlers.py +++ b/pywb/warcserver/test/test_handlers.py @@ -23,7 +23,7 @@ from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggr from pywb.warcserver.index.aggregator import DirectoryIndexSource from pywb.warcserver.basewarcserver import BaseWarcServer -from pywb.warcserver.utils import MementoUtils +from pywb.utils.memento import MementoUtils sources = { diff --git a/pywb/warcserver/upstreamindexsource.py b/pywb/warcserver/upstreamindexsource.py index 9c9d46e7..ffb70a15 100644 --- a/pywb/warcserver/upstreamindexsource.py +++ b/pywb/warcserver/upstreamindexsource.py @@ -5,7 +5,7 @@ from pywb.utils.wbexception import NotFoundException from pywb.warcserver.index.cdxobject import CDXObject from pywb.warcserver.index.indexsource import BaseIndexSource, RemoteIndexSource from pywb.warcserver.resource.responseloader import LiveWebLoader -from pywb.warcserver.utils import ParamFormatter, res_template +from pywb.utils.format import ParamFormatter, res_template #============================================================================= diff --git a/pywb/warcserver/utils.py b/pywb/warcserver/utils.py deleted file mode 100644 index 3b25b529..00000000 --- a/pywb/warcserver/utils.py +++ /dev/null @@ -1,250 +0,0 @@ -import re -import six -import string -import yaml -import os -import zlib - -from contextlib import closing - -from warcio.timeutils import timestamp_to_http_date -from warcio.utils import BUFF_SIZE - -from pywb.utils.wbexception import BadRequestException -from pywb.utils.loaders import load_yaml_config - -from six.moves.urllib.parse import quote -from tempfile import SpooledTemporaryFile - - -LINK_SPLIT = re.compile(',\s*(?=[<])') -LINK_SEG_SPLIT = re.compile(';\s*') -LINK_URL = re.compile('<(.*)>') -LINK_PROP = re.compile('([\w]+)="([^"]+)') - -#============================================================================= -class MementoException(BadRequestException): - pass - - -#============================================================================= -class MementoUtils(object): - @staticmethod - def parse_links(link_header, def_name='timemap'): - links = LINK_SPLIT.split(link_header) - results = {} - mementos = [] - - for link in links: - props = LINK_SEG_SPLIT.split(link) - m = LINK_URL.match(props[0]) - if not m: - raise MementoException('Invalid Link Url: ' + props[0]) - - result = dict(url=m.group(1)) - key = '' - is_mem = False - - for prop in props[1:]: - m = LINK_PROP.match(prop) - if not m: - raise MementoException('Invalid prop ' + prop) - - name = m.group(1) - value = m.group(2) - - if name == 'rel': - if 'memento' in value: - is_mem = True - result[name] = value - elif value == 'self': - key = def_name - else: - key = value - else: - result[name] = value - - if key: - results[key] = result - elif is_mem: - mementos.append(result) - - results['mementos'] = mementos - return results - - @staticmethod - def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'): - url = cdx.get('load_url') - if not url: - url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) - - memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end - - if not datetime: - datetime = timestamp_to_http_date(cdx['timestamp']) - - return memento.format(url, rel, datetime, cdx.get('source', '')) - - @staticmethod - def make_timemap(cdx_iter): - # get first memento as it'll be used for 'from' field - try: - first_cdx = six.next(cdx_iter) - from_date = timestamp_to_http_date(first_cdx['timestamp']) - except StopIteration: - first_cdx = None - return - - # first memento link - yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date) - - prev_cdx = None - - for cdx in cdx_iter: - if prev_cdx: - yield MementoUtils.make_timemap_memento_link(prev_cdx) - - prev_cdx = cdx - - # last memento link, if any - if prev_cdx: - yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n') - - @staticmethod - def make_link(url, type): - return '<{0}>; rel="{1}"'.format(url, type) - - @staticmethod - def make_memento_link(url, type, dt): - return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) - - -#============================================================================= -class ParamFormatter(string.Formatter): - def __init__(self, params, name='', prefix='param.'): - self.params = params - self.prefix = prefix - self.name = name - - def get_value(self, key, args, kwargs): - # First, try the named param 'param.{name}.{key}' - if self.name: - named_key = self.prefix + self.name + '.' + key - value = self.params.get(named_key) - if value is not None: - return value - - # Then, try 'param.{key}' - named_key = self.prefix + key - value = self.params.get(named_key) - if value is not None: - return value - - # try in extra params as just {key} - value = kwargs.get(key) - if value is not None: - return value - - # try in params as just '{key}' - value = self.params.get(key, '') - return value - - -#============================================================================= -def res_template(template, params, **extra_params): - formatter = params.get('_formatter') - if not formatter: - formatter = ParamFormatter(params) - - url = params.get('url', '') - qi = template.find('?') - if qi >= 0 and template.find('{url}') > qi: - url = quote(url) - - res = formatter.format(template, url=url, **extra_params) - - return res - - -#============================================================================= -def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE): - with closing(stream): - if header1: - yield header1 - - if header2: - yield header2 - - while True: - buff = stream.read(size) - if not buff: - break - yield buff - - -#============================================================================= -def chunk_encode_iter(orig_iter): - for chunk in orig_iter: - if not len(chunk): - continue - chunk_len = b'%X\r\n' % len(chunk) - yield chunk_len - yield chunk - yield b'\r\n' - - yield b'0\r\n\r\n' - - -#============================================================================= -def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4): - out = SpooledTemporaryFile(buff_size) - size = 0 - - for buff in iterator: - size += len(buff) - out.write(buff) - - content_length_str = str(size) - # remove existing content length - status_headers.replace_header('Content-Length', - content_length_str) - - out.seek(0) - return StreamIter(out) - - -#============================================================================= -def compress_gzip_iter(orig_iter): - compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16) - for chunk in orig_iter: - buff = compressobj.compress(chunk) - if len(buff) == 0: - continue - - yield buff - - yield compressobj.flush() - - -#============================================================================= -def load_config(main_env_var, main_default_file='', - overlay_env_var='', overlay_file=''): - - configfile = os.environ.get(main_env_var, main_default_file) - config = None - - if configfile: - configfile = os.path.expandvars(configfile) - - config = load_yaml_config(configfile) - - config = config or {} - - overlay_configfile = os.environ.get(overlay_env_var, overlay_file) - - if overlay_configfile: - overlay_configfile = os.path.expandvars(overlay_configfile) - config.update(load_yaml_config(overlay_configfile)) - - return config - diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 46a60aa4..f82dccae 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -1,7 +1,6 @@ -from pywb.utils.loaders import load_yaml_config +from pywb.utils.loaders import load_yaml_config, load_overlay_config from pywb.warcserver.basewarcserver import BaseWarcServer -from pywb.warcserver.utils import load_config from pywb.warcserver.index.aggregator import CacheDirectoryIndexSource, RedisMultiKeyIndexSource from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator @@ -40,7 +39,7 @@ class WarcServer(BaseWarcServer): if config_file: try: - file_config = load_config('PYWB_CONFIG_FILE', config_file) + file_config = load_overlay_config('PYWB_CONFIG_FILE', config_file) config.update(file_config) except Exception as e: if not custom_config: