1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdxindexer: if a recent ujson (one that supports not escaping forward slashes) is available, use it when indexing; closes #140

tests: update indexer CDXJ tests to be order-independent
travis: install ujson for testing
This commit is contained in:
Ilya Kreymer 2015-10-22 17:41:42 -07:00
parent c003a96618
commit e37636de84
4 changed files with 72 additions and 26 deletions

View File

@ -13,6 +13,7 @@ sudo: false
install:
- "pip install 'argparse>=1.2.1' --allow-all-external"
- pip install boto certauth
- pip install git+https://github.com/esnme/ultrajson.git
- python setup.py -q install
- pip install coverage pytest-cov coveralls --use-mirrors

View File

@ -3,10 +3,22 @@ import sys
# Use ujson if available
try:
from ujson import dumps as json_encode
except:
from json import dumps as json_encode
from ujson import dumps as ujson_dumps
try:
assert (ujson_dumps('http://example.com/',
escape_forward_slashes=False) ==
'"http://example.com/"')
except Exception as e: # pragma: no cover
sys.stderr.write('ujson w/o forward-slash escaping not available,\
defaulting to regular json\n')
raise
def json_encode(obj):
return ujson_dumps(obj, escape_forward_slashes=False)
except: # pragma: no cover
from json import dumps as json_encode
try: # pragma: no cover
from collections import OrderedDict

View File

@ -8,12 +8,6 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- minimal CDXJ
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True)
CDX N b a m s k r M S V g
@ -23,14 +17,6 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- parse all -- CDXJ
>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc
>>> print_cdx_index('example.warc')
CDX N b a m s k r M S V g
@ -52,14 +38,6 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
# arc.gz -- json
>>> print_cdx_index('example.arc.gz', cdxj=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc.gz -- minimal + json
>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc
>>> print_cdx_index('example.arc')
CDX N b a m s k r M S V g
@ -210,6 +188,8 @@ from pywb import get_test_dir
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
from pywb.cdx.cdxobject import CDXObject
from io import BytesIO
import sys
@ -302,6 +282,54 @@ def test_non_chunked_gzip_err():
print_cdx_index('example-bad.warc.gz.bad')
def parse_cdxj(string):
    """Parse newline-separated CDXJ text into a list of plain dicts.

    Each line is wrapped in ``CDXObject`` and then converted with ``dict``,
    so two parsed results can be compared with ``==`` without depending on
    the order in which the JSON keys were serialized.

    :param string: CDXJ text, one record per line. A single leading empty
        line (as produced by a triple-quoted literal that starts with a
        newline) is dropped.
    :return: a list with one dict per remaining line, in input order.
    """
    lines = string.split('\n')
    if lines[0] == '':
        lines = lines[1:]

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, and two
    # iterators never compare equal, so callers' ``==`` checks would always
    # fail regardless of content.
    return [dict(CDXObject(line)) for line in lines]
def test_cdxj_warc_minimal():
    """Minimal CDXJ index of example.warc.gz matches the expected records."""
    actual = cdx_index('example.warc.gz', minimal=True, cdxj=True)
    expected = """
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
"""
    # Compare parsed records rather than raw text so JSON key order is irrelevant.
    assert parse_cdxj(actual) == parse_cdxj(expected)
def test_cdxj_warc_all():
    """CDXJ index of example.warc.gz with include_all matches the expected records."""
    actual = cdx_index('example.warc.gz', include_all=True, cdxj=True)
    expected = """
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
"""
    # Compare parsed records rather than raw text so JSON key order is irrelevant.
    assert parse_cdxj(actual) == parse_cdxj(expected)
def test_cdxj_arc():
    """CDXJ index of example.arc.gz matches the expected record."""
    actual = cdx_index('example.arc.gz', cdxj=True)
    expected = """
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    # Compare parsed records rather than raw text so JSON key order is irrelevant.
    assert parse_cdxj(actual) == parse_cdxj(expected)
def test_cdxj_arc_minimal():
    """Minimal CDXJ index of example.arc.gz matches the expected record."""
    actual = cdx_index('example.arc.gz', cdxj=True, minimal=True)
    expected = """
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    # Compare parsed records rather than raw text so JSON key order is irrelevant.
    assert parse_cdxj(actual) == parse_cdxj(expected)
# When this module is executed directly, run its doctests.
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -16,6 +16,7 @@ from pywb.manager.manager import main
import pywb.manager.autoindex
from pywb.warc.cdxindexer import main as cdxindexer_main
from pywb.cdx.cdxobject import CDXObject
from pywb import get_test_dir
from pywb.framework.wsgi_wrappers import init_app
@ -457,7 +458,11 @@ class TestManagedColls(object):
assert all(x.endswith('.cdxj') for x in cdxjs)
with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh:
assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')
cdx = CDXObject(fh.readline())
assert cdx['urlkey'] == 'org,iana)/'
assert cdx['timestamp'] == '20140126200624'
assert cdx['url'] == 'http://www.iana.org/'
#assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')
# Nothing else to migrate
main(['cdx-convert', migrate_dir])