mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdxindexer: if the latest ujson (with support for not escaping forward slashes) is available, use it when indexing, closes #140
tests: update indexer CDXJ tests to be order-independent; travis: install ujson for testing
This commit is contained in:
parent
c003a96618
commit
e37636de84
@ -13,6 +13,7 @@ sudo: false
|
||||
install:
|
||||
- "pip install 'argparse>=1.2.1' --allow-all-external"
|
||||
- pip install boto certauth
|
||||
- pip install git+https://github.com/esnme/ultrajson.git
|
||||
- python setup.py -q install
|
||||
- pip install coverage pytest-cov coveralls --use-mirrors
|
||||
|
||||
|
@ -3,10 +3,22 @@ import sys
|
||||
|
||||
# Use ujson if available
|
||||
try:
    from ujson import dumps as ujson_dumps

    # Only use ujson when it can leave forward slashes unescaped, so that
    # URLs are written as "http://example.com/" rather than with "\/".
    # The probe is an explicit comparison instead of an ``assert`` because
    # ``python -O`` strips assert statements, which would silently skip
    # the check and defer the failure to the first real encode call.
    try:
        slashes_kept = (ujson_dumps('http://example.com/',
                                    escape_forward_slashes=False) ==
                        '"http://example.com/"')
    except Exception:  # pragma: no cover
        # Presumably an older ujson rejecting the unknown keyword argument.
        slashes_kept = False

    if not slashes_kept:  # pragma: no cover
        sys.stderr.write('ujson w/o forward-slash escaping not available, '
                         'defaulting to regular json\n')
        raise ImportError('ujson lacks escape_forward_slashes support')

    def json_encode(obj):
        """Serialize ``obj`` to JSON via ujson without escaping '/'."""
        return ujson_dumps(obj, escape_forward_slashes=False)

except Exception:  # pragma: no cover
    # Best-effort fallback: any failure importing or probing ujson selects
    # the standard library encoder. ``Exception`` (not a bare ``except``)
    # lets KeyboardInterrupt and SystemExit propagate.
    from json import dumps as json_encode
|
||||
|
||||
try: # pragma: no cover
|
||||
from collections import OrderedDict
|
||||
|
@ -8,12 +8,6 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
|
||||
# warc.gz -- minimal CDXJ
|
||||
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
||||
|
||||
# warc.gz -- parse all
|
||||
>>> print_cdx_index('example.warc.gz', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
@ -23,14 +17,6 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
|
||||
# warc.gz -- parse all -- CDXJ
|
||||
>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True)
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
|
||||
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
||||
|
||||
# warc
|
||||
>>> print_cdx_index('example.warc')
|
||||
CDX N b a m s k r M S V g
|
||||
@ -52,14 +38,6 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
||||
|
||||
# arc.gz -- json
|
||||
>>> print_cdx_index('example.arc.gz', cdxj=True)
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
|
||||
# arc.gz -- minimal + json
|
||||
>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
|
||||
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||
|
||||
# arc
|
||||
>>> print_cdx_index('example.arc')
|
||||
CDX N b a m s k r M S V g
|
||||
@ -210,6 +188,8 @@ from pywb import get_test_dir
|
||||
|
||||
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from io import BytesIO
|
||||
import sys
|
||||
|
||||
@ -302,6 +282,54 @@ def test_non_chunked_gzip_err():
|
||||
print_cdx_index('example-bad.warc.gz.bad')
|
||||
|
||||
|
||||
def parse_cdxj(string):
    """Parse CDXJ text into a list of plain dicts, one per line.

    Comparing the returned dicts (rather than raw text) makes the check
    independent of the key order inside each line's JSON block.

    :param string: newline-separated CDXJ lines; a single leading empty
                   line (as produced by a triple-quoted literal that
                   starts with a newline) is dropped
    :return: list of ``dict(CDXObject(line))`` for every remaining line
    """
    lines = string.split('\n')
    if lines[0] == '':
        lines = lines[1:]

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, and
    # two map objects never compare equal, so ``parse_cdxj(a) ==
    # parse_cdxj(b)`` would always be False.
    return [dict(CDXObject(line)) for line in lines]
|
||||
|
||||
|
||||
def test_cdxj_warc_minimal():
    """Index example.warc.gz with minimal=True, cdxj=True and compare the
    parsed output against the expected records."""
    expected = """
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
"""
    actual = cdx_index('example.warc.gz', minimal=True, cdxj=True)

    assert parse_cdxj(actual) == parse_cdxj(expected)
|
||||
|
||||
|
||||
def test_cdxj_warc_all():
    """Index example.warc.gz with include_all=True, cdxj=True and compare
    the parsed output against the expected records."""
    expected = """
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
"""
    actual = cdx_index('example.warc.gz', include_all=True, cdxj=True)

    assert parse_cdxj(actual) == parse_cdxj(expected)
|
||||
|
||||
def test_cdxj_arc():
    """Index example.arc.gz with cdxj=True and compare the parsed output
    against the expected record."""
    expected = """
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    actual = cdx_index('example.arc.gz', cdxj=True)

    assert parse_cdxj(actual) == parse_cdxj(expected)
|
||||
|
||||
def test_cdxj_arc_minimal():
    """Index example.arc.gz with cdxj=True, minimal=True and compare the
    parsed output against the expected record."""
    expected = """
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
"""
    actual = cdx_index('example.arc.gz', cdxj=True, minimal=True)

    assert parse_cdxj(actual) == parse_cdxj(expected)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the doctests embedded in this module when it is executed directly.
    from doctest import testmod
    testmod()
|
||||
|
@ -16,6 +16,7 @@ from pywb.manager.manager import main
|
||||
import pywb.manager.autoindex
|
||||
|
||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
@ -457,7 +458,11 @@ class TestManagedColls(object):
|
||||
assert all(x.endswith('.cdxj') for x in cdxjs)
|
||||
|
||||
with open(os.path.join(migrate_dir, 'iana.cdxj')) as fh:
|
||||
assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')
|
||||
cdx = CDXObject(fh.readline())
|
||||
assert cdx['urlkey'] == 'org,iana)/'
|
||||
assert cdx['timestamp'] == '20140126200624'
|
||||
assert cdx['url'] == 'http://www.iana.org/'
|
||||
#assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')
|
||||
|
||||
# Nothing else to migrate
|
||||
main(['cdx-convert', migrate_dir])
|
||||
|
Loading…
x
Reference in New Issue
Block a user