From bff39626b52c322d15856e7098e77162d84c8c53 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Feb 2014 12:33:11 -0800 Subject: [PATCH] add first set of zipnum tests #17 still need to test timed reload, multi sources --- pywb/cdx/cdxserver.py | 2 +- pywb/cdx/test/cdxserver_test.py | 5 +-- pywb/cdx/test/zipnum_test.py | 44 +++++++++++++++++++++++++ sample_archive/zipcdx/zipnum-sample.loc | 2 +- setup.py | 1 + 5 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 pywb/cdx/test/zipnum_test.py diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 8eff842c..fd0c14e9 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -214,7 +214,7 @@ def create_cdx_source(filename, config): if filename.endswith('.cdx'): return CDXFile(filename) - if filename.endswith('.summary'): + if filename.endswith(('.summary', '.idx')): return ZipNumCluster(filename, config) return None diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 384d7187..44483ca4 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -142,8 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test -#>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') -#>>> pprint.pprint(x.next().items()) +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), ('timestamp', '20020120142510'), ('original', 'http://example.com:80/'), @@ -172,6 +172,7 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): results = server.load_cdx(**kwparams) for x in results: + x = x.replace('\t', ' ') sys.stdout.write(x) diff --git a/pywb/cdx/test/zipnum_test.py b/pywb/cdx/test/zipnum_test.py new file mode 100644 
index 00000000..7c98309a --- /dev/null +++ b/pywb/cdx/test/zipnum_test.py @@ -0,0 +1,44 @@ +""" +>>> zip_ops_test(url = 'http://iana.org') +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz +org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + +# test idx index (tabs replaced with 4 spaces) +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) +org,iana)/dnssec 20140126201307 zipnum 8511 373 +org,iana)/domains/int 20140126201239 zipnum 8884 353 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 + +>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 
http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +""" + + + + +from cdxserver_test import cdx_ops_test + +from pywb import get_test_dir +test_zipnum = get_test_dir() + 'zipcdx/zipnum-sample.idx' +print test_zipnum + +def zip_ops_test(url, **kwargs): + sources = test_zipnum + cdx_ops_test(url, sources, **kwargs) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/sample_archive/zipcdx/zipnum-sample.loc b/sample_archive/zipcdx/zipnum-sample.loc index 249e1071..df4f3196 100644 --- a/sample_archive/zipcdx/zipnum-sample.loc +++ b/sample_archive/zipcdx/zipnum-sample.loc @@ -1 +1 @@ -zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz +zipnum ./sample_archive/zipcdx/zipnum-sample.cdx.gz diff --git a/setup.py b/setup.py index 94c1bca7..307506fe 100755 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ setuptools.setup(name='pywb', provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), + ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],