diff --git a/Dockerfile b/Dockerfile index 73063b33..b2ec37f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,12 +5,6 @@ MAINTAINER Ilya Kreymer RUN mkdir /uwsgi COPY uwsgi.ini /uwsgi/ -#RUN pip install gevent==1.1.2 certauth youtube-dl boto uwsgi urllib3 -#RUN pip install git+https://github.com/t0m/pyamf.git@python3 -#RUN pip install webassets pyyaml brotlipy -#RUN pip install six chardet 'requests<2.12' redis jinja2 'surt>=0.3.0' webencodings portalocker - -#RUN mkdir /pywb WORKDIR /pywb ADD requirements.txt . diff --git a/appveyor.yml b/appveyor.yml index 57957ae1..48626f21 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,7 +17,7 @@ install: - "pip install coverage pytest-cov coveralls" - "pip install cffi" - "pip install pyopenssl" - - "pip install certauth boto youtube-dl" + - "pip install certauth boto3 youtube-dl" build_script: - "python setup.py install" diff --git a/extra_requirements.txt b/extra_requirements.txt index 9e60f57b..2edeb5de 100644 --- a/extra_requirements.txt +++ b/extra_requirements.txt @@ -1,6 +1,6 @@ certauth youtube-dl -boto +boto3 uwsgi git+https://github.com/t0m/pyamf.git@python3 git+https://github.com/esnme/ultrajson.git diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index b7255939..f3712927 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -61,7 +61,7 @@ class FrontEndApp(object): static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep) self.static_handler = StaticHandler(static_path) - self.all_coll = config.get('all_coll', None) + self.cdx_api_endpoint = config.get('cdx_api_endpoint', '/cdx') self._init_routes() @@ -90,9 +90,9 @@ class FrontEndApp(object): coll_prefix = '/' self.url_map.add(Rule('/', endpoint=self.serve_home)) + self.url_map.add(Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx)) self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page)) self.url_map.add(Rule(coll_prefix + '/timemap//', endpoint=self.serve_content)) - self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx)) if self.recorder_path: self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) @@ -197,7 +197,8 @@ class FrontEndApp(object): content = view.render_to_string(environ, wb_prefix=wb_prefix, - metadata=metadata) + metadata=metadata, + coll=coll) return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 4b6c4b74..dcda32d4 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -22,7 +22,9 @@ from io import open, BytesIO from warcio.limitreader import LimitReader try: - from boto import connect_s3 + import boto3 + from botocore import UNSIGNED + from botocore.client import Config s3_avail = True except ImportError: #pragma: no cover s3_avail = False @@ -325,14 +327,14 @@ class HttpLoader(BaseLoader): #================================================================= class S3Loader(BaseLoader): def __init__(self, **kwargs): - self.s3conn = None + self.client = None self.aws_access_key_id = kwargs.get('aws_access_key_id') self.aws_secret_access_key = kwargs.get('aws_secret_access_key') def load(self, url, offset, length): if not s3_avail: #pragma: no cover raise IOError('To load from s3 paths, ' + - 'you must install boto: pip install boto') + 'you must install boto3: pip install boto3') aws_access_key_id = self.aws_access_key_id aws_secret_access_key = self.aws_secret_access_key @@ -346,24 +348,45 @@ class S3Loader(BaseLoader): else: bucket_name = parts.netloc - if not self.s3conn: - try: - self.s3conn = connect_s3(aws_access_key_id, aws_secret_access_key) - except Exception: #pragma: no cover - self.s3conn = connect_s3(anon=True) - - bucket = self.s3conn.get_bucket(bucket_name) - - key = bucket.get_key(parts.path) + key = parts.path[1:] if offset == 0 and length == -1: - headers = {} + range_ = '' else: - headers = {'Range': BlockLoader._make_range_header(offset, length)} + range_ = BlockLoader._make_range_header(offset, length) - # Read range - key.open_read(headers=headers) - return key + def s3_load(anon=False): + if not self.client: + if anon: + config = Config(signature_version=UNSIGNED) + else: + config = None + + client = boto3.client('s3', aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + config=config) + else: + client = self.client + + res = client.get_object(Bucket=bucket_name, + Key=key, + Range=range_) + + if not self.client: + self.client = client + + return res + + try: + obj = s3_load(anon=False) + + except Exception: + if not self.client: + obj = s3_load(anon=True) + else: + raise + + return obj['Body'] #================================================================= diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index d62e9626..4a217616 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -93,7 +93,7 @@ test_cdx_dir = get_test_dir() + 'cdx/' def test_s3_read_1(): - pytest.importorskip('boto') + pytest.importorskip('boto3') res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz', offset=53235662, @@ -106,6 +106,19 @@ def test_s3_read_1(): assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC-Type: response\r\n' +def test_s3_read_2(): + pytest.importorskip('boto3') + + res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html') + + buff = res.read() + assert len(buff) == 2082 + + reader = DecompressingBufferedReader(BytesIO(buff)) + assert reader.readline() == b'\n' + + + # Error def test_err_no_such_file(): # no such file diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index a8f2dd11..35507f3e 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -16,7 +16,7 @@ def to_cdxj(cdx_iter, fields): return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter) def to_json(cdx_iter, fields): - content_type = 'application/x-ndjson' + content_type = 'text/x-ndjson' return content_type, (cdx.to_json(fields) for cdx in cdx_iter) def to_text(cdx_iter, fields): diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 77506ab8..36d3e3b3 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -17,6 +17,7 @@ from pywb.utils.format import ParamFormatter, res_template from pywb.warcserver.index.indexsource import FileIndexSource, RedisIndexSource from pywb.warcserver.index.cdxops import process_cdx from pywb.warcserver.index.query import CDXQuery +from pywb.warcserver.index.zipnum import ZipNumIndexSource import six import glob @@ -55,6 +56,9 @@ class BaseAggregator(object): err_list = [(name, repr(wbe))] def add_name(cdx, name): + if not cdx.get('url'): + return cdx + if cdx.get('source'): cdx['source'] = name + ':' + cdx['source'] else: @@ -245,10 +249,11 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato #============================================================================= class BaseDirectoryIndexSource(BaseAggregator): - def __init__(self, base_prefix, base_dir='', name=''): + def __init__(self, base_prefix, base_dir='', name='', config=None): self.base_prefix = base_prefix self.base_dir = base_dir self.name = name + self.config = config def _iter_sources(self, params): the_dir = res_template(self.base_dir, params) @@ -269,7 +274,10 @@ class BaseDirectoryIndexSource(BaseAggregator): for name in os.listdir(the_dir): filename = os.path.join(the_dir, name) - if filename.endswith(FileIndexSource.CDX_EXT): + is_cdx = filename.endswith(FileIndexSource.CDX_EXT) + is_zip = filename.endswith(ZipNumIndexSource.IDX_EXT) + + if is_cdx or is_zip: #print('Adding ' + filename) rel_path = os.path.relpath(the_dir, self.base_prefix) if rel_path == '.': @@ -280,7 +288,12 @@ class BaseDirectoryIndexSource(BaseAggregator): if self.name: full_name = self.name + ':' + full_name - yield full_name, FileIndexSource(filename) + if is_cdx: + index_src = FileIndexSource(filename) + else: + index_src = ZipNumIndexSource(filename, self.config) + + yield full_name, index_src def __repr__(self): return '{0}(file://{1})'.format(self.__class__.__name__, diff --git a/pywb/warcserver/index/query.py b/pywb/warcserver/index/query.py index e88ddfbf..64538a59 100644 --- a/pywb/warcserver/index/query.py +++ b/pywb/warcserver/index/query.py @@ -7,8 +7,8 @@ from pywb.utils.canonicalize import calc_search_range class CDXQuery(object): def __init__(self, params): self.params = params - url = self.url - url = self.params.get('alt_url', url) + alt_url = self.params.get('alt_url') + url = alt_url or self.url if not self.params.get('matchType'): if url.startswith('*.'): url = self.params['url'] = url[2:] @@ -19,6 +19,9 @@ class CDXQuery(object): else: self.params['matchType'] = 'exact' + if alt_url: + self.params['alt_url'] = url + start, end = calc_search_range(url=url, match_type=self.params['matchType'], url_canon=self.params.get('_url_canon')) diff --git a/pywb/warcserver/index/test/test_zipnum.py b/pywb/warcserver/index/test/test_zipnum.py index b5a5863c..32aee8d4 100644 --- a/pywb/warcserver/index/test/test_zipnum.py +++ b/pywb/warcserver/index/test/test_zipnum.py @@ -175,7 +175,7 @@ def test_zip_prefix_load(): cdx_iter, err = results results = list(cdx_iter) assert len(results) == 1, results - assert results[0] == {"blocks": 38, "pages": 4, "pageSize": 10, "source": "zip"} + assert results[0] == {"blocks": 38, "pages": 4, "pageSize": 10} # Test simple query diff --git a/pywb/warcserver/index/zipnum.py b/pywb/warcserver/index/zipnum.py index 84d9ec02..6037b2cd 100644 --- a/pywb/warcserver/index/zipnum.py +++ b/pywb/warcserver/index/zipnum.py @@ -1,8 +1,9 @@ +from io import BytesIO + import os import collections import itertools import logging -from io import BytesIO import datetime import json import six @@ -20,8 +21,8 @@ from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.binsearch import iter_range, linearsearch, search -#================================================================= -class ZipBlocks: +# ============================================================================ +class ZipBlocks(object): def __init__(self, part, offset, length, count): self.part = part self.offset = offset @@ -29,7 +30,19 @@ class ZipBlocks: self.count = count -#================================================================= +# ============================================================================ +class AlwaysJsonResponse(dict): + def to_json(self, *args): + return json.dumps(self) + + def to_text(self, *args): + return json.dumps(self) + + def to_cdxj(self, *args): + return json.dumps(self) + + +# ============================================================================ #TODO: see if these could be combined with warc path resolvers class LocMapResolver(object): @@ -76,7 +89,7 @@ class LocMapResolver(object): return self.loc_map[part] -#================================================================= +# ============================================================================ class LocPrefixResolver(object): """ Use a prefix lookup, where the prefix can either be a fixed string or can be a regex replacement of the index summary path @@ -95,10 +108,11 @@ class LocPrefixResolver(object): return [self.prefix + part] -#================================================================= +# ============================================================================ class ZipNumIndexSource(BaseIndexSource): DEFAULT_RELOAD_INTERVAL = 10 # in minutes DEFAULT_MAX_BLOCKS = 10 + IDX_EXT = ('.idx', '.summary') def __init__(self, summary, config=None): self.max_blocks = self.DEFAULT_MAX_BLOCKS @@ -118,7 +132,6 @@ class ZipNumIndexSource(BaseIndexSource): reload_ival = config.get('reload_interval', reload_ival) - if isinstance(loc, dict): self.loc_resolver = LocPrefixResolver(summary, loc) else: @@ -132,23 +145,6 @@ class ZipNumIndexSource(BaseIndexSource): self.blk_loader = BlockLoader(cookie_maker=cookie_maker) -# @staticmethod -# def reload_timed(timestamp, val, delta, func): -# now = datetime.datetime.now() -# if now - timestamp >= delta: -# func() -# return now -# return None -# -# def reload_loc(self): -# reload_time = self.reload_timed(self.loc_update_time, -# self.loc_map, -# self.reload_interval, -# self.load_loc) -# -# if reload_time: -# self.loc_update_time = reload_time - def load_index(self, params): self.loc_resolver.load_loc() return self._do_load_cdx(self.summary, CDXQuery(params)) @@ -177,12 +173,12 @@ class ZipNumIndexSource(BaseIndexSource): return gen_cdx() - def _page_info(self, pages, pagesize, blocks): - info = dict(pages=pages, + info = AlwaysJsonResponse( + pages=pages, pageSize=pagesize, blocks=blocks) - #return json.dumps(info) + '\n' + return info def compute_page_range(self, reader, query): @@ -338,7 +334,6 @@ class ZipNumIndexSource(BaseIndexSource): a line iterator which decompresses and returns one line at a time, bounded by query.key and query.end_key """ - if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' logging.debug(msg.format(b=blocks, loc=location)) @@ -391,7 +386,7 @@ class ZipNumIndexSource(BaseIndexSource): if value.startswith('file://'): value = value[7:] - if is_zipnum or value.endswith(('.idx', '.summary')): + if is_zipnum or value.endswith(cls.IDX_EXT): return cls(value, None) @classmethod diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index f9df3cb2..e51665e0 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -78,9 +78,10 @@ class WarcServer(BaseWarcServer): templ = self.config.get(name) def get_full_path(path): - path = os.path.join(self.AUTO_COLL_TEMPL, path, '') - if abs_path and '://' not in path: - path = os.path.join(abs_path, path) + if '://' not in path: + path = os.path.join(self.AUTO_COLL_TEMPL, path, '') + if abs_path: + path = os.path.join(abs_path, path) return path if isinstance(templ, str): @@ -94,7 +95,8 @@ class WarcServer(BaseWarcServer): return dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, - base_dir=self.index_paths) + base_dir=self.index_paths, + config=self.config) return DefaultResourceHandler(dir_source, self.archive_paths) diff --git a/tests/test_zipnum_auto_dir.py b/tests/test_zipnum_auto_dir.py new file mode 100644 index 00000000..7a3f77b5 --- /dev/null +++ b/tests/test_zipnum_auto_dir.py @@ -0,0 +1,50 @@ +from .base_config_test import BaseConfigTest, CollsDirMixin +from pywb.manager.manager import main as manager + +from pywb.warcserver.index.cdxobject import CDXObject +import shutil +from pywb import get_test_dir +import os +import json + + +# ============================================================================ +class TestZipnumAutoDir(CollsDirMixin, BaseConfigTest): + @classmethod + def setup_class(cls): + super(TestZipnumAutoDir, cls).setup_class('config_test.yaml') + + manager(['init', 'testzip']) + + cls.archive_dir = os.path.join(cls.root_dir, '_test_colls', 'testzip', 'archive') + cls.index_dir = os.path.join(cls.root_dir, '_test_colls', 'testzip', 'indexes') + + zip_cdx = os.path.join(get_test_dir(), 'zipcdx') + + shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.idx'), cls.index_dir) + shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.cdx.gz'), cls.index_dir) + shutil.copy(os.path.join(zip_cdx, 'zipnum-sample.loc'), cls.index_dir) + + shutil.copy(os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz'), cls.archive_dir) + + def test_cdxj_query(self): + res = self.testapp.get('/testzip/cdx?url=iana.org/domains/*') + assert len(res.text.rstrip().split('\n')) == 9 + + def test_num_pages_query(self): + res = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&showNumPages=true&pageSize=4') + res.content_type = 'text/json' + assert(res.json == {"blocks": 38, "pages": 10, "pageSize": 4}) + + def test_paged_index_query(self): + res = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&output=json&showPagedIndex=true&pageSize=4&page=1') + + lines = [json.loads(line) for line in res.text.rstrip().split('\n')] + + assert lines[0] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5} + assert lines[1] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6} + assert lines[2] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7} + assert lines[3] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8} + + +