diff --git a/.travis.yml b/.travis.yml index 09218b17..77b52ef5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ sudo: false install: - "pip install 'argparse>=1.2.1' --allow-all-external" - pip install boto certauth + - pip install git+https://github.com/esnme/ultrajson.git - python setup.py -q install - pip install coverage pytest-cov coveralls --use-mirrors diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 819194ae..122df7f4 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -3,10 +3,22 @@ import sys # Use ujson if available try: - from ujson import dumps as json_encode -except: - from json import dumps as json_encode + from ujson import dumps as ujson_dumps + try: + assert (ujson_dumps('http://example.com/', + escape_forward_slashes=False) == + '"http://example.com/"') + except Exception as e: # pragma: no cover + sys.stderr.write('ujson w/o forward-slash escaping not available,\ +defaulting to regular json\n') + raise + + def json_encode(obj): + return ujson_dumps(obj, escape_forward_slashes=False) + +except: # pragma: no cover + from json import dumps as json_encode try: # pragma: no cover from collections import OrderedDict diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index df8dd385..bb3f110e 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -8,12 +8,6 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -# warc.gz -- minimal CDXJ ->>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True) -com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} -com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} -org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} - # warc.gz -- parse all >>> print_cdx_index('example.warc.gz', include_all=True) CDX N b a m s k r M S V g @@ -23,14 +17,6 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -# warc.gz -- parse all -- CDXJ ->>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True) -com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} -com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"} -com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} -com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"} -org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} - # warc >>> print_cdx_index('example.warc') CDX N b a m s k r M S V g @@ -52,14 +38,6 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz -# arc.gz -- json ->>> print_cdx_index('example.arc.gz', cdxj=True) -com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} - -# arc.gz -- minimal + json ->>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True) -com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} - # arc >>> print_cdx_index('example.arc') CDX N b a m s k r M S V g @@ -210,6 +188,8 @@ from pywb import get_test_dir from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename +from pywb.cdx.cdxobject import CDXObject + from io import BytesIO import sys @@ -302,6 +282,54 @@ def test_non_chunked_gzip_err(): print_cdx_index('example-bad.warc.gz.bad') +def parse_cdxj(string): + lines = string.split('\n') + if lines[0] == '': + lines = lines[1:] + cdxlist = map(CDXObject, lines) + return map(dict, cdxlist) + + +def test_cdxj_warc_minimal(): + # cdxj minimal + res = cdx_index('example.warc.gz', minimal=True, cdxj=True) + + assert parse_cdxj(res) == parse_cdxj(""" +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} +""") + + +def test_cdxj_warc_all(): + # warc.gz -- parse all -- CDXJ + res = cdx_index('example.warc.gz', include_all=True, cdxj=True) + + assert parse_cdxj(res) == parse_cdxj(""" +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} +""") + +def test_cdxj_arc(): + # arc.gz -- json + res = cdx_index('example.arc.gz', cdxj=True) + assert parse_cdxj(res) == parse_cdxj(""" +com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + +def test_cdxj_arc_minimal(): + # arc.gz -- minimal + json + res = cdx_index('example.arc.gz', cdxj=True, minimal=True) + assert parse_cdxj(res) == parse_cdxj(""" +com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"} +""") + + + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 72291f88..e79d1269 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -16,6 +16,7 @@ from pywb.manager.manager import main import pywb.manager.autoindex from pywb.warc.cdxindexer import main as cdxindexer_main +from pywb.cdx.cdxobject import CDXObject from pywb import get_test_dir from pywb.framework.wsgi_wrappers import init_app @@ -457,7 +458,11 @@ class TestManagedColls(object): assert all(x.endswith('.cdxj') for x in cdxjs) with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh: - assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",') + cdx = CDXObject(fh.readline()) + assert cdx['urlkey'] == 'org,iana)/' + assert cdx['timestamp'] == '20140126200624' + assert cdx['url'] == 'http://www.iana.org/' + #assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",') # Nothing else to migrate main(['cdx-convert', migrate_dir])