1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

refactor: split warcserver.utils into utils package:

- utils.io for stream/compression related utils
- utils.format for string formatting
- utils.memento for Memento Link header and timemap helpers (MementoUtils)
- load_config -> utils.loaders.load_overlay_config
- also: use warcio.utils.to_native_str instead of utils.loaders.to_native_str
This commit is contained in:
Ilya Kreymer 2017-06-05 16:58:47 -07:00
parent 3bd682e3d3
commit d12f715d81
22 changed files with 54 additions and 280 deletions

View File

@ -7,7 +7,9 @@ from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six import iteritems
from pywb.utils.loaders import load_yaml_config, to_native_str
from warcio.utils import to_native_str
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
from pywb.warcserver.warcserver import WarcServer

View File

@ -14,6 +14,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE
from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
@ -22,9 +24,6 @@ from warcio.recordloader import ArcWarcRecordLoader
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.apps.wbrequestresponse import WbResponse
from pywb.warcserver.utils import BUFF_SIZE
from pywb.warcserver.utils import MementoUtils
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView

View File

@ -1,7 +1,7 @@
from pywb.utils.canonicalize import canonicalize
from pywb.warcserver.inputrequest import PostQueryExtractor
from pywb.warcserver.utils import BUFF_SIZE
from pywb.utils.io import BUFF_SIZE
from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator

View File

@ -10,7 +10,7 @@ import portalocker
from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.warcserver.utils import res_template
from pywb.utils.format import res_template
# ============================================================================

View File

@ -1,5 +1,5 @@
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
from pywb.warcserver.utils import ParamFormatter, res_template
from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.format import ParamFormatter, res_template
from pywb.warcserver.inputrequest import DirectWSGIInputRequest
from warcio.recordloader import ArcWarcRecordLoader

View File

@ -4,12 +4,13 @@ from io import BytesIO
import os
from pywb.utils.canonicalize import calc_search_range
from pywb.utils.format import res_template
from pywb.indexer.cdxindexer import write_cdx_index
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.warcserver.index.indexsource import RedisIndexSource
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.utils import res_template
from pywb.recorder.filters import WriteRevisitDupePolicy

View File

@ -16,7 +16,7 @@ from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARC
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.warcserver.utils import MementoUtils
from pywb.utils.memento import MementoUtils
from pywb.warcserver.index.cdxobject import CDXObject

View File

@ -9,7 +9,7 @@ import re
import webencodings
import tempfile
from pywb.warcserver.utils import StreamIter, BUFF_SIZE
from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.loaders import load_yaml_config

View File

@ -245,7 +245,7 @@ Exception: ('Invalid WbUrl: ', '')
from pywb.rewrite.wburl import WbUrl
from six.moves.urllib.parse import quote_plus, unquote_plus
from pywb.utils.loaders import to_native_str
from warcio.utils import to_native_str
from io import StringIO

View File

@ -44,7 +44,7 @@ import six
from six.moves.urllib.parse import urlsplit, urlunsplit
from six.moves.urllib.parse import quote_plus, quote, unquote_plus
from pywb.utils.loaders import to_native_str
from warcio.utils import to_native_str
#=================================================================

View File

@ -6,6 +6,7 @@ local and remote access
import os
import hmac
import requests
import yaml
import six
from six.moves.urllib.request import pathname2url, url2pathname
@ -18,7 +19,6 @@ import cgi
from io import open, BytesIO
from warcio.limitreader import LimitReader
from warcio.utils import to_native_str
try:
from boto import connect_s3
@ -46,9 +46,8 @@ def load(filename):
return BlockLoader().load(filename)
#=================================================================
# =============================================================================
def load_yaml_config(config_file):
import yaml
config = None
configdata = None
try:
@ -61,6 +60,29 @@ def load_yaml_config(config_file):
return config
# =============================================================================
def load_overlay_config(main_env_var, main_default_file='',
                        overlay_env_var='', overlay_file=''):
    """Load a main YAML config and merge an optional overlay config on top.

    The main config path is taken from the ``main_env_var`` environment
    variable, falling back to ``main_default_file``. The overlay path is
    taken from ``overlay_env_var``, falling back to ``overlay_file``.
    Environment variable references inside either path are expanded.
    A path that resolves to an empty value is skipped.

    :param str main_env_var: env var naming the main config file
    :param str main_default_file: main config path used if the env var is unset
    :param str overlay_env_var: env var naming the overlay config file
    :param str overlay_file: overlay path used if the env var is unset
    :return: the main config updated with the overlay's top-level keys,
        or an empty dict if nothing was loaded
    :rtype: dict
    """
    configfile = os.environ.get(main_env_var, main_default_file)
    config = None

    if configfile:
        configfile = os.path.expandvars(configfile)
        config = load_yaml_config(configfile)

    # the loader may produce nothing (e.g. an empty file)
    config = config or {}

    overlay_configfile = os.environ.get(overlay_env_var, overlay_file)

    if overlay_configfile:
        overlay_configfile = os.path.expandvars(overlay_configfile)
        overlay = load_yaml_config(overlay_configfile)
        # guard the overlay the same way as the main config:
        # dict.update(None) would raise TypeError
        if overlay:
            config.update(overlay)

    return config
#=================================================================
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')

View File

@ -1,11 +1,11 @@
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
from pywb.utils.memento import MementoUtils
from warcio.recordloader import ArchiveLoadFailed
from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
from pywb.warcserver.utils import MementoUtils
import six

View File

@ -12,7 +12,7 @@ from collections import deque
from itertools import chain
from pywb.utils.wbexception import NotFoundException, WbException
from pywb.warcserver.utils import ParamFormatter, res_template
from pywb.utils.format import ParamFormatter, res_template
from pywb.warcserver.index.indexsource import FileIndexSource, RedisIndexSource
from pywb.warcserver.index.cdxops import process_cdx

View File

@ -10,7 +10,7 @@ from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.parse import parse_qs
from pywb.utils.wbexception import WbException
from pywb.utils.loaders import to_native_str
from warcio.utils import to_native_str
from json import loads as json_decode
from json import dumps as json_encode

View File

@ -7,8 +7,8 @@ from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.warcserver.utils import ParamFormatter, res_template
from pywb.warcserver.utils import MementoUtils
from pywb.utils.format import ParamFormatter, res_template
from pywb.utils.memento import MementoUtils
import redis

View File

@ -2,7 +2,7 @@ from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.utils.loaders import BlockLoader
from pywb.warcserver.utils import BUFF_SIZE
from pywb.utils.io import BUFF_SIZE
#=================================================================

View File

@ -1,7 +1,7 @@
import redis
from warcio.utils import to_native_str
from pywb.utils.binsearch import iter_exact
from pywb.utils.loaders import to_native_str
from pywb.warcserver.index.indexsource import RedisIndexSource

View File

@ -7,8 +7,9 @@ from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.warcserver.utils import MementoUtils, StreamIter, compress_gzip_iter
from pywb.warcserver.utils import ParamFormatter
from pywb.utils.memento import MementoUtils
from pywb.utils.io import StreamIter, compress_gzip_iter
from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin

View File

@ -23,7 +23,7 @@ from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggr
from pywb.warcserver.index.aggregator import DirectoryIndexSource
from pywb.warcserver.basewarcserver import BaseWarcServer
from pywb.warcserver.utils import MementoUtils
from pywb.utils.memento import MementoUtils
sources = {

View File

@ -5,7 +5,7 @@ from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.warcserver.index.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.warcserver.resource.responseloader import LiveWebLoader
from pywb.warcserver.utils import ParamFormatter, res_template
from pywb.utils.format import ParamFormatter, res_template
#=============================================================================

View File

@ -1,250 +0,0 @@
import re
import six
import string
import yaml
import os
import zlib
from contextlib import closing
from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE
from pywb.utils.wbexception import BadRequestException
from pywb.utils.loaders import load_yaml_config
from six.moves.urllib.parse import quote
from tempfile import SpooledTemporaryFile
# Regexes used to pick apart an HTTP Link header value such as:
#   <http://a/>; rel="original", <http://b/>; rel="memento"; datetime="..."
# Raw strings are used so that regex escapes (\s, \w) are not interpreted
# as (invalid) string escape sequences.

# splits the header into individual links: comma followed by the next '<'
LINK_SPLIT = re.compile(r',\s*(?=[<])')
# splits one link into its url segment and its ';'-separated properties
LINK_SEG_SPLIT = re.compile(r';\s*')
# captures the url between the angle brackets
LINK_URL = re.compile('<(.*)>')
# captures a name="value" property pair
LINK_PROP = re.compile(r'([\w]+)="([^"]+)')
#=============================================================================
class MementoException(BadRequestException):
    """Bad-request error for an invalid Memento Link header value."""
#=============================================================================
class MementoUtils(object):
    """Static helpers for parsing and building Memento Link header values."""

    @staticmethod
    def parse_links(link_header, def_name='timemap'):
        """Parse a Link header value into a dict of link entries.

        Each entry is a dict holding ``url`` plus any non-``rel`` properties.
        A link whose ``rel`` contains ``memento`` also keeps its ``rel`` and
        is appended to the ``mementos`` list, unless another ``rel`` on the
        same link supplied a key. Other links are stored under their ``rel``
        value, with ``self`` stored under ``def_name``.

        :param str link_header: the raw Link header value
        :param str def_name: key used for a link with ``rel="self"``
        :return: dict of keyed entries plus a ``mementos`` list
        :raises MementoException: if a url segment or property is malformed
        """
        mementos = []
        results = {}

        for link in LINK_SPLIT.split(link_header):
            segments = LINK_SEG_SPLIT.split(link)

            url_match = LINK_URL.match(segments[0])
            if url_match is None:
                raise MementoException('Invalid Link Url: ' + segments[0])

            entry = {'url': url_match.group(1)}
            key = ''
            is_mem = False

            for segment in segments[1:]:
                prop_match = LINK_PROP.match(segment)
                if prop_match is None:
                    raise MementoException('Invalid prop ' + segment)

                name, value = prop_match.group(1, 2)

                if name != 'rel':
                    entry[name] = value
                elif 'memento' in value:
                    is_mem = True
                    entry[name] = value
                elif value == 'self':
                    key = def_name
                else:
                    key = value

            # a keyed link takes precedence over the memento list
            if key:
                results[key] = entry
            elif is_mem:
                mementos.append(entry)

        results['mementos'] = mementos
        return results

    @staticmethod
    def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
        """Format one timemap line for a cdx entry.

        The url is the cdx ``load_url`` or, if that is missing or empty, a
        ``file://`` reference built from filename, offset and length.
        If ``datetime`` is not given, it is derived from the cdx timestamp.
        ``end`` is appended to the line before formatting.
        """
        url = cdx.get('load_url') or 'file://{0}:{1}:{2}'.format(
            cdx.get('filename'), cdx.get('offset'), cdx.get('length'))

        line_template = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end

        if not datetime:
            datetime = timestamp_to_http_date(cdx['timestamp'])

        source = cdx.get('source', '')
        return line_template.format(url, rel, datetime, source)

    @staticmethod
    def make_timemap(cdx_iter):
        """Yield timemap lines for every cdx entry of an iterator.

        Yields nothing for an empty iterator. Every line ends with ``,\\n``
        except the final line of a multi-entry timemap, which ends with
        ``\\n``.
        """
        make_line = MementoUtils.make_timemap_memento_link

        # the first entry is pulled eagerly, its date is computed up front
        try:
            first_cdx = six.next(cdx_iter)
            from_date = timestamp_to_http_date(first_cdx['timestamp'])
        except StopIteration:
            return

        yield make_line(first_cdx, datetime=from_date)

        # emit one entry behind, so the last entry can get a different ending
        pending = None
        for cdx in cdx_iter:
            if pending:
                yield make_line(pending)
            pending = cdx

        if pending:
            yield make_line(pending, end='\n')

    @staticmethod
    def make_link(url, type):
        """Return a ``<url>; rel="type"`` link value."""
        link = '<{0}>; rel="{1}"'.format(url, type)
        return link

    @staticmethod
    def make_memento_link(url, type, dt):
        """Return a ``<url>; rel="type"; datetime="dt"`` link value."""
        link = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
        return link
#=============================================================================
class ParamFormatter(string.Formatter):
    """Formatter that resolves named fields from a params dict.

    A field ``{key}`` is looked up, in order, as:

    1. ``<prefix><name>.<key>`` in params (only if ``name`` is set)
    2. ``<prefix><key>`` in params
    3. ``key`` in the keyword arguments passed to ``format()``
    4. ``key`` in params, defaulting to an empty string

    A lookup that yields ``None`` is treated as missing and falls through
    to the next step. Positional fields (``{0}``, ``{}``) are resolved from
    the positional arguments as in the base ``string.Formatter``.

    :param dict params: the values to resolve fields from
    :param str name: optional name used for the most specific lookup
    :param str prefix: prefix of the param keys, ``param.`` by default
    """
    def __init__(self, params, name='', prefix='param.'):
        self.params = params
        self.prefix = prefix
        self.name = name

    def get_value(self, key, args, kwargs):
        # positional fields arrive as ints and can't be joined onto the
        # string prefix, let the base class index into args
        if isinstance(key, int):
            return super(ParamFormatter, self).get_value(key, args, kwargs)

        # First, try the named param 'param.{name}.{key}'
        if self.name:
            named_key = self.prefix + self.name + '.' + key
            value = self.params.get(named_key)
            if value is not None:
                return value

        # Then, try 'param.{key}'
        named_key = self.prefix + key
        value = self.params.get(named_key)
        if value is not None:
            return value

        # try in extra params as just {key}
        value = kwargs.get(key)
        if value is not None:
            return value

        # try in params as just '{key}', empty string if not found at all
        return self.params.get(key, '')
#=============================================================================
def res_template(template, params, **extra_params):
    """Fill in a template string from params and extra keyword values.

    Uses the formatter stored in ``params['_formatter']`` if there is one,
    otherwise a ``ParamFormatter`` over ``params``. The ``url`` param
    (empty string by default) is always passed to the formatter as ``url``,
    and is quoted first if ``{url}`` appears after a ``?`` in the template.

    :param str template: format string to fill in
    :param dict params: values to resolve the template fields from
    :param extra_params: additional values passed on to the formatter
    :return: the formatted string
    """
    formatter = params.get('_formatter') or ParamFormatter(params)

    url = params.get('url', '')

    # quote the url only if its placeholder is part of the query string
    query_start = template.find('?')
    if 0 <= query_start < template.find('{url}'):
        url = quote(url)

    return formatter.format(template, url=url, **extra_params)
#=============================================================================
def StreamIter(stream, header1=None, header2=None, size=BUFF_SIZE):
    """Generator yielding the optional headers, then the stream in chunks.

    The stream is closed once the generator finishes or is closed.

    :param stream: object with ``read(size)`` and ``close()``
    :param header1: yielded first, if truthy
    :param header2: yielded second, if truthy
    :param int size: number of bytes requested per read
    """
    with closing(stream):
        for header in (header1, header2):
            if header:
                yield header

        # read until the stream returns an empty (falsy) buffer
        buff = stream.read(size)
        while buff:
            yield buff
            buff = stream.read(size)
#=============================================================================
def chunk_encode_iter(orig_iter):
    """Wrap an iterator of byte chunks in HTTP chunked transfer encoding.

    Each non-empty chunk is yielded as three pieces: its length in uppercase
    hex followed by CRLF, the chunk itself, and a trailing CRLF. Empty chunks
    are dropped. The terminating zero-length chunk is always yielded last.
    """
    for data in orig_iter:
        size = len(data)
        if size:
            yield b'%X\r\n' % size
            yield data
            yield b'\r\n'

    yield b'0\r\n\r\n'
#=============================================================================
def buffer_iter(status_headers, iterator, buff_size=BUFF_SIZE * 4):
    """Buffer an entire iterator to find its length, then stream it back.

    All chunks are written to a ``SpooledTemporaryFile`` created with
    ``buff_size`` as its max in-memory size. The total byte count replaces
    the ``Content-Length`` header, and the buffered data is returned as a
    ``StreamIter`` over the rewound file.

    :param status_headers: object providing ``replace_header(name, value)``
    :param iterator: iterable of byte chunks
    :param int buff_size: passed to ``SpooledTemporaryFile``
    :return: a ``StreamIter`` generator over the buffered content
    """
    spool = SpooledTemporaryFile(buff_size)
    total = 0

    for data in iterator:
        total += len(data)
        spool.write(data)

    # replace any existing content length with the actual size
    status_headers.replace_header('Content-Length', str(total))

    spool.seek(0)
    return StreamIter(spool)
#=============================================================================
def compress_gzip_iter(orig_iter):
    """Gzip-compress an iterator of byte chunks on the fly.

    Yields compressed output as it becomes available, skipping chunks for
    which the compressor produced no output yet, and always ends with the
    result of flushing the compressor.
    """
    # MAX_WBITS + 16 selects the gzip container, 9 is the compression level
    compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)

    for data in orig_iter:
        compressed = compressor.compress(data)
        if compressed:
            yield compressed

    yield compressor.flush()
#=============================================================================
def load_config(main_env_var, main_default_file='',
                overlay_env_var='', overlay_file=''):
    """Load a main YAML config and merge an optional overlay config on top.

    The main config path is taken from the ``main_env_var`` environment
    variable, falling back to ``main_default_file``. The overlay path is
    taken from ``overlay_env_var``, falling back to ``overlay_file``.
    Environment variable references inside either path are expanded.
    A path that resolves to an empty value is skipped.

    :param str main_env_var: env var naming the main config file
    :param str main_default_file: main config path used if the env var is unset
    :param str overlay_env_var: env var naming the overlay config file
    :param str overlay_file: overlay path used if the env var is unset
    :return: the main config updated with the overlay's top-level keys,
        or an empty dict if nothing was loaded
    :rtype: dict
    """
    configfile = os.environ.get(main_env_var, main_default_file)
    config = None

    if configfile:
        configfile = os.path.expandvars(configfile)
        config = load_yaml_config(configfile)

    # the loader may produce nothing (e.g. an empty file)
    config = config or {}

    overlay_configfile = os.environ.get(overlay_env_var, overlay_file)

    if overlay_configfile:
        overlay_configfile = os.path.expandvars(overlay_configfile)
        overlay = load_yaml_config(overlay_configfile)
        # guard the overlay the same way as the main config:
        # dict.update(None) would raise TypeError
        if overlay:
            config.update(overlay)

    return config

View File

@ -1,7 +1,6 @@
from pywb.utils.loaders import load_yaml_config
from pywb.utils.loaders import load_yaml_config, load_overlay_config
from pywb.warcserver.basewarcserver import BaseWarcServer
from pywb.warcserver.utils import load_config
from pywb.warcserver.index.aggregator import CacheDirectoryIndexSource, RedisMultiKeyIndexSource
from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator
@ -40,7 +39,7 @@ class WarcServer(BaseWarcServer):
if config_file:
try:
file_config = load_config('PYWB_CONFIG_FILE', config_file)
file_config = load_overlay_config('PYWB_CONFIG_FILE', config_file)
config.update(file_config)
except Exception as e:
if not custom_config: