1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add first set of zipnum tests #17

still need to test timed reload, multi sources
This commit is contained in:
Ilya Kreymer 2014-02-27 12:33:11 -08:00
parent 7863b2bade
commit bff39626b5
5 changed files with 50 additions and 4 deletions

View File

@ -214,7 +214,7 @@ def create_cdx_source(filename, config):
if filename.endswith('.cdx'):
return CDXFile(filename)
if filename.endswith('.summary'):
if filename.endswith(('.summary', '.idx')):
return ZipNumCluster(filename, config)
return None

View File

@ -142,8 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
#>>> pprint.pprint(x.next().items())
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
@ -172,6 +172,7 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
results = server.load_cdx(**kwparams)
for x in results:
x = x.replace('\t', ' ')
sys.stdout.write(x)

View File

@ -0,0 +1,44 @@
"""
>>> zip_ops_test(url = 'http://iana.org')
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# test idx index (tabs replacad with 4 spaces)
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
"""
from cdxserver_test import cdx_ops_test
from pywb import get_test_dir
test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx'
print test_zipnum
def zip_ops_test(url, **kwargs):
sources = test_zipnum
cdx_ops_test(url, sources, **kwargs)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1 +1 @@
zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz
zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz

View File

@ -15,6 +15,7 @@ setuptools.setup(name='pywb',
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],