1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-30 18:55:31 +02:00
pywb/pywb/indexer/test/test_indexing.py
John Berlin dfc3033117 Added skipping of metadata records with mime = text/anvl to cdxindexer.py. (#366)
Updated test_indexing.py to include a test for no-indexing metadata records with mime == text/anvl
Fixes https://github.com/webrecorder/webrecorderplayer-electron/issues/63.
2018-08-20 15:04:09 -07:00

469 lines
22 KiB
Python

r"""
#=================================================================
# warc.gz
>>> print_cdx_index('example.warc.gz')
CDX N b a m s k r M S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True)
CDX N b a m s k r M S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 488 1376 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc
>>> print_cdx_index('example.warc')
CDX N b a m s k r M S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 896 3161 example.warc
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 854 4771 example.warc
# warc all
# note: length of request record set to 1 byte less then record to test truncation handling
>>> print_cdx_index('example.warc', include_all=True)
CDX N b a m s k r M S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc
com,example)/?example=1 20140103030321 http://example.com?example=1 - - - - - 706 2451 example.warc
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 896 3161 example.warc
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 703 4061 example.warc
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 854 4771 example.warc
# arc.gz
>>> print_cdx_index('example.arc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
# arc
>>> print_cdx_index('example.arc')
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# arc.gz
>>> print_cdx_index('example.arc.gz', arc2warc=True)
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
# arc
>>> print_cdx_index('example.arc', arc2warc=True)
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# wget warc, includes metadata by default
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wget warc, includes metadata and request
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wpull warc, includes metadata by default
>>> print_cdx_index('example-wpull.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140401000000 http://example.com/ text/html 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
com,example)/ 20140102000000 http://example.com/ text/plain 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
com,example)/ 20140401000000 http://example.com/ text/html 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
# POST request tests
#=================================================================
# no post append, no requests
>>> print_cdx_index('post-test.warc.gz')
CDX N b a m s k r M S V g
org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
# post append
>>> print_cdx_index('post-test.warc.gz', append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
# no post append, requests included
>>> print_cdx_index('post-test.warc.gz', include_all=True)
CDX N b a m s k r M S V g
org,httpbin)/post 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
org,httpbin)/post 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
# post append + requests included
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
# post append + minimal = error
>>> print_cdx_index('example.arc.gz', append_post=True, minimal=True)
Traceback (most recent call last):
Exception: Sorry, minimal index option and append POST options can not be used together
# Test with custom verbs/protocol
#================================================================
# no validation
>>> print_cdx_index('example-extra.warc')
CDX N b a m s k r M S V g
com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc
com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc
com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 3207 example-extra.warc
com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 5910 example-extra.warc
>>> print_cdx_index('example-extra.warc', verify_http=True) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', 'HTTP/1.1'] - Found: HTTPX/1.1 200 OK
# Test CLI interface -- (check for num lines)
#=================================================================
# test sort, multiple inputs
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 212
# test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 212
# test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 407
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
Total: 4
# test writing to stdout ('-' omitted)
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
Total: 4
# test custom root dir for cdx filenames, singlw warc
>>> cli_lines(['--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 ../warcs/example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 ../warcs/example.warc.gz
Total: 4
# test custom root dir for cdx filenames, dir input
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 212
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
example.cdx
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
Total: 4
"""
from pywb import get_test_dir
from pywb.indexer.cdxindexer import write_cdx_index, main, cdx_filename
from pywb.warcserver.index.cdxobject import CDXObject
from io import BytesIO
import sys
import os
import shutil
import tempfile
from pytest import raises
TEST_CDX_DIR = get_test_dir() + 'cdx/'
TEST_WARC_DIR = get_test_dir() + 'warcs/'
def read_fully(cdx):
with open(TEST_CDX_DIR + cdx, 'rb') as fh:
curr = BytesIO()
while True:
b = fh.read()
if not b:
break
curr.write(b)
return curr.getvalue()
def cdx_index(warc, **options):
buff = BytesIO()
with open(TEST_WARC_DIR + warc, 'rb') as fh:
write_cdx_index(buff, fh, warc, **options)
return buff.getvalue()
def print_cdx_index(*args, **kwargs):
sys.stdout.write(cdx_index(*args, **kwargs).decode('utf-8'))
def assert_cdx_match(cdx, warc, sort=False):
assert read_fully(cdx) == cdx_index(warc, sort=sort)
def test_sorted_warc_gz():
assert_cdx_match('example.cdx', 'example.warc.gz', sort=True)
assert_cdx_match('dupes.cdx', 'dupes.warc.gz', sort=True)
assert_cdx_match('iana.cdx', 'iana.warc.gz', sort=True)
def cli_lines(cmds):
buff = BytesIO()
orig = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else None
sys.stdout.buffer = buff
main(cmds)
sys.stdout.buffer = orig
lines = buff.getvalue().rstrip().split(b'\n')
# print first, last, num lines
print(lines[1].decode('utf-8'))
print(lines[-1].decode('utf-8'))
print('Total: ' + str(len(lines)))
def cli_lines_with_dir(input_):
try:
lines = None
tmp_dir = None
tmp_dir = tempfile.mkdtemp()
main([tmp_dir, input_])
filename = cdx_filename(os.path.basename(input_))
print(filename)
with open(os.path.join(tmp_dir, filename), 'rb') as fh:
lines = fh.read(8192).rstrip().split(b'\n')
finally:
try:
if tmp_dir:
shutil.rmtree(tmp_dir)
except OSError as exc:
if exc.errno != 2:
raise
if not lines:
return
# print first, last, num lines
print(lines[1].decode('utf-8'))
print(lines[-1].decode('utf-8'))
print('Total: ' + str(len(lines)))
def test_non_chunked_gzip_err():
with raises(Exception):
print_cdx_index('example-bad.warc.gz.bad')
def parse_cdxj(string):
lines = string.split(b'\n')
if lines[0] == b'':
lines = lines[1:]
cdxlist = list(map(CDXObject, lines))
return list(map(dict, cdxlist))
def test_cdxj_warc_minimal():
# cdxj minimal
res = cdx_index('example.warc.gz', minimal=True, cdxj=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
""")
def test_cdxj_warc_all():
# warc.gz -- parse all -- CDXJ
res = cdx_index('example.warc.gz', include_all=True, cdxj=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
""")
def test_cdxj_arc():
# arc.gz -- json
res = cdx_index('example.arc.gz', cdxj=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_arc_minimal():
# arc.gz -- minimal + json
res = cdx_index('example.arc.gz', cdxj=True, minimal=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_arc_conv():
# arc.gz -- json
res = cdx_index('example.arc.gz', cdxj=True, arc2warc=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_arc_minimal_conv():
# arc.gz -- minimal + json
res = cdx_index('example.arc.gz', cdxj=True, minimal=True, arc2warc=True)
assert parse_cdxj(res) == parse_cdxj(b"""
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
""")
def test_cdxj_empty():
options = dict(cdxj=True)
buff = BytesIO()
empty = BytesIO()
write_cdx_index(buff, empty, 'empty.warc.gz', **options)
assert buff.getvalue() == b''
def test_cdxj_middle_empty_records():
empty_gzip_record = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
new_warc = BytesIO()
with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as fh:
new_warc.write(empty_gzip_record)
new_warc.write(fh.read())
new_warc.write(empty_gzip_record)
new_warc.write(empty_gzip_record)
fh.seek(0)
new_warc.write(fh.read())
options = dict(cdxj=True)
buff = BytesIO()
new_warc.seek(0)
write_cdx_index(buff, new_warc, 'empty.warc.gz', **options)
lines = buff.getvalue().rstrip().split(b'\n')
assert len(lines) == 2, lines
def test_invalid_decoding_uri_py2():
test_data = b'\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: text/plain\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n'
options = dict(include_all=True)
buff = BytesIO()
test_record = BytesIO(test_data)
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
assert buff.getvalue() == b"""\
CDX N b a m s k r M S V g
com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 text/plain - 7MXYLSEFM7Z4RTU3PGOHYVDEFUGHWQPW - - 222 0 test.warc.gz
"""
def test_no_index_metadata_mime_textanvl():
test_data = b'\
WARC/0.18\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:1fd7789c-9cd5-47ea-b7ba-2a97dc06680b>\r\n\
WARC-Target-URI: http://example.com/xyz.pdf\r\n\
WARC-Date: 2014-04-01T05:20:11Z\r\n\
WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/0.18\r\n\
WARC-Type: metadata\r\n\
WARC-Record-ID: <urn:uuid:0735267f-5749-4c02-b08b-955af5d76032>\r\n\
WARC-Target-URI: http://example.com/xyz.pdf\r\n\
WARC-Date: 2014-04-01T05:20:11Z\r\n\
WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
Content-Type: text/anvl\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n\
'
options = dict(include_all=True)
buff = BytesIO()
test_record = BytesIO(test_data)
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
assert buff.getvalue() == b"""\
CDX N b a m s k r M S V g
com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf application/http 200 EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS - - 310 0 test.warc.gz
"""
if __name__ == "__main__":
import doctest
doctest.testmod()