1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
pywb/tests/test_zipnum_auto_dir.py
Ilya Kreymer 54b265aaa8 s3 and zipnum fixes: (#253)
* s3 and zipnum fixes:
- update s3 to use boto3
- ensure zipnum indexes (.idx, .summary) are picked up automatically via DirectoryAggregator
- ensure the showNumPages query always returns a JSON object, ignoring output=
- add tests for auto-configured zipnum indexes

* reqs: add boto3 dependency, init boto Config only if avail

* s3 loader: first try with credentials, then with no-cred config
archive paths: don't add anything if path is fully qualified (contains '://')

* s3 loader: on first load, if credentialed load fails, try uncredentialed
fix typo
tests: add zipnum auto collection tests

* zipnum page count query: don't add 'source' field to page count query (if 'url' key not present in dict)

* s3 loader: fix no-range load, add test, update skip check to boto3

* fix spacing

* boto -> boto3 rename error message, cleanup comments
2017-10-11 15:33:57 -07:00

51 lines
2.4 KiB
Python

from .base_config_test import BaseConfigTest, CollsDirMixin
from pywb.manager.manager import main as manager
from pywb.warcserver.index.cdxobject import CDXObject
import shutil
from pywb import get_test_dir
import os
import json
# ============================================================================
class TestZipnumAutoDir(CollsDirMixin, BaseConfigTest):
    """CDX query tests against a 'testzip' collection holding a zipnum index.

    The sample zipnum files are copied straight into the collection's
    ``indexes`` directory during class setup; the tests then query the
    collection's ``/testzip/cdx`` endpoint.
    """

    @classmethod
    def setup_class(cls):
        """Create the 'testzip' collection and populate it with fixtures.

        Copies the sample zipnum index files (.idx, .cdx.gz, .loc) into the
        collection's ``indexes`` directory and ``iana.warc.gz`` into its
        ``archive`` directory.
        """
        super(TestZipnumAutoDir, cls).setup_class('config_test.yaml')
        manager(['init', 'testzip'])

        coll_root = os.path.join(cls.root_dir, '_test_colls', 'testzip')
        cls.archive_dir = os.path.join(coll_root, 'archive')
        cls.index_dir = os.path.join(coll_root, 'indexes')

        # Sample zipnum index: summary (.idx), compressed shard and location map.
        sample_dir = os.path.join(get_test_dir(), 'zipcdx')
        index_files = (
            'zipnum-sample.idx',
            'zipnum-sample.cdx.gz',
            'zipnum-sample.loc',
        )
        for file_name in index_files:
            shutil.copy(os.path.join(sample_dir, file_name), cls.index_dir)

        warc_path = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
        shutil.copy(warc_path, cls.archive_dir)

    def test_cdxj_query(self):
        """A prefix query under iana.org/domains/ yields nine result lines."""
        resp = self.testapp.get('/testzip/cdx?url=iana.org/domains/*')
        result_lines = resp.text.rstrip().split('\n')
        assert len(result_lines) == 9

    def test_num_pages_query(self):
        """showNumPages returns the block/page counts as a JSON object."""
        resp = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&showNumPages=true&pageSize=4')

        # NOTE(review): presumably overridden so the test client's `.json`
        # accessor accepts the body regardless of the served type -- confirm.
        resp.content_type = 'text/json'

        expected = {"blocks": 38, "pages": 10, "pageSize": 4}
        assert resp.json == expected

    def test_paged_index_query(self):
        """showPagedIndex with page=1 and pageSize=4 lists the expected blocks."""
        resp = self.testapp.get('/testzip/cdx?url=http://iana.org/domains/&matchType=domain&output=json&showPagedIndex=true&pageSize=4&page=1')
        entries = [json.loads(row) for row in resp.text.rstrip().split('\n')]

        expected_entries = [
            {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5},
            {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6},
            {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7},
            {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8},
        ]

        # Compare entry by entry (only the first four are checked).
        for position, wanted in enumerate(expected_entries):
            assert entries[position] == wanted