Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
s3 and zipnum fixes: (#253)
* s3 and zipnum fixes:
  - update s3 loader to use boto3
  - ensure zipnum indexes (.idx, .summary) are picked up automatically via DirectoryAggregator
  - ensure showNumPages query always returns a JSON object, ignoring output=
  - add tests for auto-configured zipnum indexes
* reqs: add boto3 dependency, init boto Config only if avail
* s3 loader: first try with credentials, then with no-cred config
* archive paths: don't add anything if path is fully qualified (contains '://')
* s3 loader: on first load, if credentialed load fails, try uncredentialed; fix typo
* tests: add zipnum auto collection tests
* zipnum page count query: don't add 'source' field to page count query (if 'url' key not present in dict)
* s3 loader: fix no-range load, add test, update skip check to boto3
* fix spacing
* boto -> boto3 rename in error message, cleanup comments
Parent: 22ff4bd976
Commit: 54b265aaa8
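The core of the change is the s3 loader's credential fallback, visible in the loaders.py hunks below: a signed boto3 client is tried first, and only if that first load fails (and no client has been cached yet) is the request retried with an unsigned client, which works for public buckets. A minimal standalone sketch of the same pattern, not pywb's actual S3Loader (the bucket and key below are the public Common Crawl objects used in the tests):

import boto3
from botocore import UNSIGNED
from botocore.client import Config

def fetch_s3_body(bucket, key, range_=''):
    # First try a client signed with ambient credentials
    # (env vars, ~/.aws/credentials, instance role)...
    try:
        client = boto3.client('s3')
        return client.get_object(Bucket=bucket, Key=key, Range=range_)['Body']
    except Exception:
        # ...then fall back to an unsigned (anonymous) client.
        client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        return client.get_object(Bucket=bucket, Key=key, Range=range_)['Body']

body = fetch_s3_body('commoncrawl', 'crawl-data/CC-MAIN-2015-11/index.html')
print(len(body.read()))  # the test below expects 2082 bytes for this object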
@@ -5,12 +5,6 @@ MAINTAINER Ilya Kreymer <ikreymer at gmail.com>
 RUN mkdir /uwsgi
 COPY uwsgi.ini /uwsgi/
 
-#RUN pip install gevent==1.1.2 certauth youtube-dl boto uwsgi urllib3
-#RUN pip install git+https://github.com/t0m/pyamf.git@python3
-#RUN pip install webassets pyyaml brotlipy
-#RUN pip install six chardet 'requests<2.12' redis jinja2 'surt>=0.3.0' webencodings portalocker
-
-#RUN mkdir /pywb
 WORKDIR /pywb
 
 ADD requirements.txt .
@@ -17,7 +17,7 @@ install:
   - "pip install coverage pytest-cov coveralls"
   - "pip install cffi"
   - "pip install pyopenssl"
-  - "pip install certauth boto youtube-dl"
+  - "pip install certauth boto3 youtube-dl"
 
 build_script:
   - "python setup.py install"
@@ -1,6 +1,6 @@
 certauth
 youtube-dl
-boto
+boto3
 uwsgi
 git+https://github.com/t0m/pyamf.git@python3
 git+https://github.com/esnme/ultrajson.git
@@ -61,7 +61,7 @@ class FrontEndApp(object):
         static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep)
         self.static_handler = StaticHandler(static_path)
 
         self.all_coll = config.get('all_coll', None)
+        self.cdx_api_endpoint = config.get('cdx_api_endpoint', '/cdx')
 
         self._init_routes()
 
@@ -90,9 +90,9 @@ class FrontEndApp(object):
         coll_prefix = '/<coll>'
         self.url_map.add(Rule('/', endpoint=self.serve_home))
 
+        self.url_map.add(Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx))
         self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
         self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
-        self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
 
         if self.recorder_path:
             self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
@@ -197,7 +197,8 @@ class FrontEndApp(object):
 
         content = view.render_to_string(environ,
                                         wb_prefix=wb_prefix,
-                                        metadata=metadata)
+                                        metadata=metadata,
+                                        coll=coll)
 
         return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
 
@@ -22,7 +22,9 @@ from io import open, BytesIO
 from warcio.limitreader import LimitReader
 
 try:
-    from boto import connect_s3
+    import boto3
+    from botocore import UNSIGNED
+    from botocore.client import Config
     s3_avail = True
 except ImportError: #pragma: no cover
     s3_avail = False
@@ -325,14 +327,14 @@ class HttpLoader(BaseLoader):
 #=================================================================
 class S3Loader(BaseLoader):
     def __init__(self, **kwargs):
-        self.s3conn = None
+        self.client = None
         self.aws_access_key_id = kwargs.get('aws_access_key_id')
         self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
 
     def load(self, url, offset, length):
         if not s3_avail: #pragma: no cover
             raise IOError('To load from s3 paths, ' +
-                          'you must install boto: pip install boto')
+                          'you must install boto3: pip install boto3')
 
         aws_access_key_id = self.aws_access_key_id
         aws_secret_access_key = self.aws_secret_access_key
@@ -346,24 +348,45 @@ class S3Loader(BaseLoader):
         else:
             bucket_name = parts.netloc
 
-        if not self.s3conn:
-            try:
-                self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key)
-            except Exception: #pragma: no cover
-                self.s3conn = connect_s3(anon=True)
-
-        bucket = self.s3conn.get_bucket(bucket_name)
-
-        key = bucket.get_key(parts.path)
+        key = parts.path[1:]
 
         if offset == 0 and length == -1:
-            headers = {}
+            range_ = ''
         else:
-            headers = {'Range': BlockLoader._make_range_header(offset, length)}
+            range_ = BlockLoader._make_range_header(offset, length)
 
-        # Read range
-        key.open_read(headers=headers)
-        return key
+        def s3_load(anon=False):
+            if not self.client:
+                if anon:
+                    config = Config(signature_version=UNSIGNED)
+                else:
+                    config = None
+
+                client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
+                                      aws_secret_access_key=aws_secret_access_key,
+                                      config=config)
+            else:
+                client = self.client
+
+            res = client.get_object(Bucket=bucket_name,
+                                    Key=key,
+                                    Range=range_)
+
+            if not self.client:
+                self.client = client
+
+            return res
+
+        try:
+            obj = s3_load(anon=False)
+
+        except Exception:
+            if not self.client:
+                obj = s3_load(anon=True)
+            else:
+                raise
+
+        return obj['Body']
 
 
 #=================================================================
@@ -93,7 +93,7 @@ test_cdx_dir = get_test_dir() + 'cdx/'
 
 
 def test_s3_read_1():
-    pytest.importorskip('boto')
+    pytest.importorskip('boto3')
 
     res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                              offset=53235662,
@@ -106,6 +106,19 @@ def test_s3_read_1():
     assert reader.readline() == b'WARC/1.0\r\n'
     assert reader.readline() == b'WARC-Type: response\r\n'
 
+def test_s3_read_2():
+    pytest.importorskip('boto3')
+
+    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
+
+    buff = res.read()
+    assert len(buff) == 2082
+
+    reader = DecompressingBufferedReader(BytesIO(buff))
+    assert reader.readline() == b'<!DOCTYPE html>\n'
+
+
+
 # Error
 def test_err_no_such_file():
     # no such file
@@ -16,7 +16,7 @@ def to_cdxj(cdx_iter, fields):
     return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)
 
 def to_json(cdx_iter, fields):
-    content_type = 'application/x-ndjson'
+    content_type = 'text/x-ndjson'
     return content_type, (cdx.to_json(fields) for cdx in cdx_iter)
 
 def to_text(cdx_iter, fields):
@@ -17,6 +17,7 @@ from pywb.utils.format import ParamFormatter, res_template
 from pywb.warcserver.index.indexsource import FileIndexSource, RedisIndexSource
 from pywb.warcserver.index.cdxops import process_cdx
 from pywb.warcserver.index.query import CDXQuery
+from pywb.warcserver.index.zipnum import ZipNumIndexSource
 
 import six
 import glob
@@ -55,6 +56,9 @@ class BaseAggregator(object):
             err_list = [(name, repr(wbe))]
 
         def add_name(cdx, name):
+            if not cdx.get('url'):
+                return cdx
+
             if cdx.get('source'):
                 cdx['source'] = name + ':' + cdx['source']
             else:
@@ -245,10 +249,11 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
 
 #=============================================================================
 class BaseDirectoryIndexSource(BaseAggregator):
-    def __init__(self, base_prefix, base_dir='', name=''):
+    def __init__(self, base_prefix, base_dir='', name='', config=None):
         self.base_prefix = base_prefix
         self.base_dir = base_dir
         self.name = name
+        self.config = config
 
     def _iter_sources(self, params):
         the_dir = res_template(self.base_dir, params)
@@ -269,7 +274,10 @@ class BaseDirectoryIndexSource(BaseAggregator):
         for name in os.listdir(the_dir):
             filename = os.path.join(the_dir, name)
 
-            if filename.endswith(FileIndexSource.CDX_EXT):
+            is_cdx = filename.endswith(FileIndexSource.CDX_EXT)
+            is_zip = filename.endswith(ZipNumIndexSource.IDX_EXT)
+
+            if is_cdx or is_zip:
                 #print('Adding ' + filename)
                 rel_path = os.path.relpath(the_dir, self.base_prefix)
                 if rel_path == '.':
@@ -280,7 +288,12 @@ class BaseDirectoryIndexSource(BaseAggregator):
                 if self.name:
                     full_name = self.name + ':' + full_name
 
-                yield full_name, FileIndexSource(filename)
+                if is_cdx:
+                    index_src = FileIndexSource(filename)
+                else:
+                    index_src = ZipNumIndexSource(filename, self.config)
+
+                yield full_name, index_src
 
     def __repr__(self):
         return '{0}(file://{1})'.format(self.__class__.__name__,
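To make the new dispatch concrete: _iter_sources now routes plain cdx files to FileIndexSource and zipnum index files to ZipNumIndexSource. A small sketch of just the extension check; IDX_EXT is taken from the zipnum hunk further down, while CDX_EXT = ('.cdx', '.cdxj') is an assumption about FileIndexSource's constant:

# Hedged sketch of the extension dispatch added above.
# Assumed: FileIndexSource.CDX_EXT == ('.cdx', '.cdxj');
# IDX_EXT == ('.idx', '.summary') comes from the ZipNumIndexSource hunk below.
CDX_EXT = ('.cdx', '.cdxj')
IDX_EXT = ('.idx', '.summary')

for filename in ('indexes/index.cdxj', 'indexes/zipnum-sample.idx',
                 'indexes/zipnum-sample.loc'):
    is_cdx = filename.endswith(CDX_EXT)   # str.endswith accepts a tuple
    is_zip = filename.endswith(IDX_EXT)
    kind = 'FileIndexSource' if is_cdx else ('ZipNumIndexSource' if is_zip else 'skipped')
    print(filename, '->', kind)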
@@ -7,8 +7,8 @@ from pywb.utils.canonicalize import calc_search_range
 class CDXQuery(object):
     def __init__(self, params):
         self.params = params
-        url = self.url
-        url = self.params.get('alt_url', url)
+        alt_url = self.params.get('alt_url')
+        url = alt_url or self.url
         if not self.params.get('matchType'):
             if url.startswith('*.'):
                 url = self.params['url'] = url[2:]
@@ -19,6 +19,9 @@ class CDXQuery(object):
         else:
             self.params['matchType'] = 'exact'
 
+        if alt_url:
+            self.params['alt_url'] = url
+
         start, end = calc_search_range(url=url,
                                        match_type=self.params['matchType'],
                                        url_canon=self.params.get('_url_canon'))
@@ -175,7 +175,7 @@ def test_zip_prefix_load():
     cdx_iter, err = results
     results = list(cdx_iter)
     assert len(results) == 1, results
-    assert results[0] == {"blocks": 38, "pages": 4, "pageSize": 10, "source": "zip"}
+    assert results[0] == {"blocks": 38, "pages": 4, "pageSize": 10}
 
 
 # Test simple query
@@ -1,8 +1,9 @@
-from io import BytesIO
-
 import os
 import collections
 import itertools
 import logging
+from io import BytesIO
 import datetime
 import json
 import six
@@ -20,8 +21,8 @@ from pywb.utils.loaders import BlockLoader, read_last_line
 from pywb.utils.binsearch import iter_range, linearsearch, search
 
 
-#=================================================================
-class ZipBlocks:
+# ============================================================================
+class ZipBlocks(object):
     def __init__(self, part, offset, length, count):
         self.part = part
         self.offset = offset
@@ -29,7 +30,19 @@
         self.count = count
 
 
-#=================================================================
+# ============================================================================
+class AlwaysJsonResponse(dict):
+    def to_json(self, *args):
+        return json.dumps(self)
+
+    def to_text(self, *args):
+        return json.dumps(self)
+
+    def to_cdxj(self, *args):
+        return json.dumps(self)
+
+
+# ============================================================================
 #TODO: see if these could be combined with warc path resolvers
 
 class LocMapResolver(object):
@@ -76,7 +89,7 @@ class LocMapResolver(object):
         return self.loc_map[part]
 
 
-#=================================================================
+# ============================================================================
 class LocPrefixResolver(object):
     """ Use a prefix lookup, where the prefix can either be a fixed
     string or can be a regex replacement of the index summary path
@@ -95,10 +108,11 @@ class LocPrefixResolver(object):
         return [self.prefix + part]
 
 
-#=================================================================
+# ============================================================================
 class ZipNumIndexSource(BaseIndexSource):
     DEFAULT_RELOAD_INTERVAL = 10  # in minutes
     DEFAULT_MAX_BLOCKS = 10
+    IDX_EXT = ('.idx', '.summary')
 
     def __init__(self, summary, config=None):
         self.max_blocks = self.DEFAULT_MAX_BLOCKS
@@ -118,7 +132,6 @@
 
         reload_ival = config.get('reload_interval', reload_ival)
 
-
         if isinstance(loc, dict):
             self.loc_resolver = LocPrefixResolver(summary, loc)
         else:
@@ -132,23 +145,6 @@
 
         self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
 
-#    @staticmethod
-#    def reload_timed(timestamp, val, delta, func):
-#        now = datetime.datetime.now()
-#        if now - timestamp >= delta:
-#            func()
-#            return now
-#        return None
-#
-#    def reload_loc(self):
-#        reload_time = self.reload_timed(self.loc_update_time,
-#                                        self.loc_map,
-#                                        self.reload_interval,
-#                                        self.load_loc)
-#
-#        if reload_time:
-#            self.loc_update_time = reload_time
-
     def load_index(self, params):
         self.loc_resolver.load_loc()
         return self._do_load_cdx(self.summary, CDXQuery(params))
@@ -177,12 +173,12 @@
 
         return gen_cdx()
 
 
     def _page_info(self, pages, pagesize, blocks):
-        info = dict(pages=pages,
+        info = AlwaysJsonResponse(
+                    pages=pages,
                     pageSize=pagesize,
                     blocks=blocks)
-        #return json.dumps(info) + '\n'
 
         return info
 
     def compute_page_range(self, reader, query):
@@ -338,7 +334,6 @@
         a line iterator which decompresses and returns one line at a time,
         bounded by query.key and query.end_key
         """
-
         if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
             msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
             logging.debug(msg.format(b=blocks, loc=location))
@@ -391,7 +386,7 @@
         if value.startswith('file://'):
            value = value[7:]
 
-        if is_zipnum or value.endswith(('.idx', '.summary')):
+        if is_zipnum or value.endswith(cls.IDX_EXT):
             return cls(value, None)
 
     @classmethod
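The AlwaysJsonResponse subclass added above is what makes the showNumPages result format-independent: the cdx output functions (to_cdxj/to_json/to_text, see the earlier hunk) serialize each record by calling the matching method on it, and this subclass answers all three with JSON. A self-contained illustration, with the class body copied from the hunk:

import json

class AlwaysJsonResponse(dict):
    def to_json(self, *args):
        return json.dumps(self)

    def to_text(self, *args):
        return json.dumps(self)

    def to_cdxj(self, *args):
        return json.dumps(self)

info = AlwaysJsonResponse(pages=4, pageSize=10, blocks=38)

# Whichever output= the client requested, the serialized form is the same JSON
for hook in (info.to_json, info.to_text, info.to_cdxj):
    assert json.loads(hook('ignored fields arg')) == {'pages': 4, 'pageSize': 10, 'blocks': 38}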
@@ -78,8 +78,9 @@ class WarcServer(BaseWarcServer):
         templ = self.config.get(name)
 
         def get_full_path(path):
+            if '://' not in path:
                 path = os.path.join(self.AUTO_COLL_TEMPL, path, '')
-            if abs_path and '://' not in path:
+                if abs_path:
                     path = os.path.join(abs_path, path)
             return path
@@ -94,7 +95,8 @@ class WarcServer(BaseWarcServer):
             return
 
         dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
-                                               base_dir=self.index_paths)
+                                               base_dir=self.index_paths,
+                                               config=self.config)
 
         return DefaultResourceHandler(dir_source, self.archive_paths)
 
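The get_full_path change above means relative index/archive paths are still expanded against the collection template and root, while fully qualified paths (anything containing '://') now pass through untouched, so s3:// archive locations work as-is. A standalone sketch with assumed values for the template and root:

import os

AUTO_COLL_TEMPL = '{coll}'          # assumed value of self.AUTO_COLL_TEMPL
abs_path = '/data/collections/'     # assumed collections root

def get_full_path(path):
    # mirrors the patched helper
    if '://' not in path:
        path = os.path.join(AUTO_COLL_TEMPL, path, '')
        if abs_path:
            path = os.path.join(abs_path, path)
    return path

print(get_full_path('archive'))            # /data/collections/{coll}/archive/
print(get_full_path('s3://bucket/warcs/')) # unchanged: s3://bucket/warcs/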
tests/test_zipnum_auto_dir.py (new file, +50 lines)
@@ -0,0 +1,50 @@
+from .base_config_test import BaseConfigTest, CollsDirMixin
+from pywb.manager.manager import main as manager
+
+from pywb.warcserver.index.cdxobject import CDXObject
+import shutil
+from pywb import get_test_dir
+import os
+import json
+
+
+# ============================================================================
+class TestZipnumAutoDir(CollsDirMixin, BaseConfigTest):
+    @classmethod
+    def setup_class(cls):
+        super(TestZipnumAutoDir, cls).setup_class('config_test.yaml')
+
+        manager(['init', 'testzip'])
+
+        cls.archive_dir = os.path.join(cls.root_dir, '_test_colls', 'testzip', 'archive')
+        cls.index_dir = os.path.join(cls.root_dir, '_test_colls', 'testzip', 'indexes')
+
+        zip_cdx = os.path.join(get_test_dir(), 'zipcdx')
+
+        shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.idx'), cls.index_dir)
+        shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.cdx.gz'), cls.index_dir)
+        shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.loc'), cls.index_dir)
+
+        shutil.copy(os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz'), cls.archive_dir)
+
+    def test_cdxj_query(self):
+        res = self.testapp.get('/testzip/cdx?url=iana.org/domains/*')
+        assert len(res.text.rstrip().split('\n')) == 9
+
+    def test_num_pages_query(self):
+        res = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&showNumPages=true&pageSize=4')
+        assert res.content_type == 'text/json'
+        assert(res.json == {"blocks": 38, "pages": 10, "pageSize": 4})
+
+    def test_paged_index_query(self):
+        res = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&output=json&showPagedIndex=true&pageSize=4&page=1')
+
+        lines = [json.loads(line) for line in res.text.rstrip().split('\n')]
+
+        assert lines[0] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5}
+        assert lines[1] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6}
+        assert lines[2] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
+        assert lines[3] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
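The behavior those tests pin down can also be checked against a running instance; a hedged sketch using requests, assuming a local pywb serving the testzip collection on the default port 8080:

import requests

# showNumPages queries now return a JSON object regardless of output=
res = requests.get('http://localhost:8080/testzip/cdx',
                   params={'url': 'http://iana.org/domains/',
                           'matchType': 'domain',
                           'showNumPages': 'true',
                           'pageSize': '4'})

print(res.json())  # e.g. {"blocks": 38, "pages": 10, "pageSize": 4}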