mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-31 03:04:12 +02:00
382 lines
15 KiB
Python
382 lines
15 KiB
Python
|
|
"""
|
|
Test loading different types of records from a variety of formats
|
|
|
|
# Load response record from compressed WARC
|
|
>>> load_test_archive('example.warc.gz', '333', '1043')
|
|
(('warc', 'response'),
|
|
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'response'),
|
|
('WARC-Record-ID', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
|
|
('WARC-Date', '2014-01-03T03:03:21Z'),
|
|
('Content-Length', '1610'),
|
|
('Content-Type', 'application/http; msgtype=response'),
|
|
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
|
('WARC-Target-URI', 'http://example.com?example=1'),
|
|
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')]))
|
|
|
|
# Load revisit record from compressed WARC
|
|
>>> load_test_archive('example.warc.gz', '1864', '553')
|
|
(('warc', 'revisit'),
|
|
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'revisit'),
|
|
('WARC-Record-ID', '<urn:uuid:3619f5b0-d967-44be-8f24-762098d427c4>'),
|
|
('WARC-Date', '2014-01-03T03:03:41Z'),
|
|
('Content-Length', '340'),
|
|
('Content-Type', 'application/http; msgtype=response'),
|
|
('WARC-Payload-Digest', 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
|
|
('WARC-Target-URI', 'http://example.com?example=1'),
|
|
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>'),
|
|
( 'WARC-Profile',
|
|
'http://netpreserve.org/warc/0.18/revisit/identical-payload-digest'),
|
|
('WARC-Refers-To-Target-URI', 'http://example.com?example=1'),
|
|
('WARC-Refers-To-Date', '2014-01-03T03:03:21Z')]),
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')]))
|
|
|
|
# request parsing
|
|
>>> load_test_archive('example.warc.gz', '1376', '488')
|
|
(('warc', 'request'),
|
|
StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [ ('WARC-Type', 'request'),
|
|
('WARC-Record-ID', '<urn:uuid:9a3ffea5-9556-4790-a6bf-c15231fd6b97>'),
|
|
('WARC-Date', '2014-01-03T03:03:21Z'),
|
|
('Content-Length', '323'),
|
|
('Content-Type', 'application/http; msgtype=request'),
|
|
('WARC-Concurrent-To', '<urn:uuid:6d058047-ede2-4a13-be79-90c17c631dd4>'),
|
|
('WARC-Target-URI', 'http://example.com?example=1'),
|
|
('WARC-Warcinfo-ID', '<urn:uuid:fbd6cf0a-6160-4550-b343-12188dc05234>')]),
|
|
StatusAndHeaders(protocol = 'GET', statusline = '/?example=1 HTTP/1.1', headers = [ ('Connection', 'close'),
|
|
( 'Accept',
|
|
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
|
|
('Accept-Language', 'en-US,en;q=0.8'),
|
|
( 'User-Agent',
|
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page)'),
|
|
('Host', 'example.com')]))
|
|
|
|
|
|
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []))
|
|
|
|
|
|
# Test of record loading based on cdx line
|
|
# Print parsed http headers + 2 lines of content
|
|
# ==============================================================================
|
|
|
|
# Test loading from compressed ARC based on cdx line
|
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
# Uncompressed arc
|
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Sun, 16 Feb 2014 05:02:20 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Sun, 23 Feb 2014 05:02:20 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
|
|
# Test loading from compressed WARC based on cdx line
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
# Uncompressed WARC
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 460 example.warc')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Fri, 10 Jan 2014 03:03:21 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
# Test cdx w/ revisit, also no length specified
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz 1043 333 example.warc.gz')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Fri, 03 Jan 2014 03:03:41 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Fri, 10 Jan 2014 03:03:41 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
# Test loading warc created by wget 1.14
|
|
>>> load_from_cdx_test('com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz')
|
|
StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Cache-Control', 'max-age=604800'),
|
|
('Content-Type', 'text/html'),
|
|
('Date', 'Sun, 16 Feb 2014 01:29:08 GMT'),
|
|
('Etag', '"359670651"'),
|
|
('Expires', 'Sun, 23 Feb 2014 01:29:08 GMT'),
|
|
('Last-Modified', 'Fri, 09 Aug 2013 23:54:35 GMT'),
|
|
('Server', 'ECS (sjc/4FB4)'),
|
|
('X-Cache', 'HIT'),
|
|
('x-ec-custom-error', '1'),
|
|
('Content-Length', '1270')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
|
|
# Test Url Agnostic Revisit Resolving
|
|
# ==============================================================================
|
|
>>> load_from_cdx_test(URL_AGNOSTIC_ORIG_CDX)
|
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Content-Type', 'text/html; charset=UTF-8'),
|
|
('Date', 'Tue, 02 Jul 2013 19:54:02 GMT'),
|
|
('ETag', '"780602-4f6-4db31b2978ec0"'),
|
|
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX)
|
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Content-Type', 'text/html; charset=UTF-8'),
|
|
('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'),
|
|
('ETag', '"780602-4f6-4db31b2978ec0"'),
|
|
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX)
|
|
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Accept-Ranges', 'bytes'),
|
|
('Content-Type', 'text/html; charset=UTF-8'),
|
|
('Date', 'Mon, 29 Jul 2013 19:51:51 GMT'),
|
|
('ETag', '"780602-4f6-4db31b2978ec0"'),
|
|
('Last-Modified', 'Thu, 25 Apr 2013 16:13:23 GMT'),
|
|
('Server', 'ECS (sjc/4FCE)'),
|
|
('X-Cache', 'HIT'),
|
|
('Content-Length', '1270'),
|
|
('Connection', 'close')])
|
|
<!doctype html>
|
|
<html>
|
|
|
|
# Error Handling
|
|
# ==============================================================================
|
|
|
|
# Warc not found, keep track of failed files
|
|
>>> failed_files = []
|
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\
|
|
failed_files=failed_files)
|
|
Exception: ArchiveLoadFailed
|
|
|
|
# ensure failed_files being filled
|
|
>>> failed_files
|
|
['x-not-found-x.warc.gz']
|
|
|
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 x-not-found-x.warc.gz',\
|
|
failed_files=failed_files)
|
|
Exception: ArchiveLoadFailed
|
|
|
|
|
|
# Invalid WARC Offset
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1860 example.warc.gz 1043 333 example.warc.gz')
|
|
Exception: ArchiveLoadFailed
|
|
|
|
|
|
# Invalid ARC Offset
|
|
>>> load_from_cdx_test('com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 170 example.arc.gz')
|
|
Exception: ArchiveLoadFailed
|
|
|
|
|
|
# Error Expected with revisit -- invalid offset on original
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 1043 330 example.warc.gz')
|
|
Exception: ArchiveLoadFailed
|
|
|
|
|
|
# Test EOF
|
|
>>> parse_stream_error(stream=None, statusline='', known_format='warc')
|
|
Exception: EOFError
|
|
|
|
>>> parse_stream_error(stream=None, statusline='', known_format='arc')
|
|
Exception: EOFError
|
|
|
|
# Invalid ARC
|
|
>>> parse_stream_error(stream=None, statusline='ABC', known_format='arc')
|
|
Exception: ArchiveLoadFailed
|
|
|
|
# Invalid WARC
|
|
>>> parse_stream_error(stream=None, statusline='ABC', known_format='warc')
|
|
Exception: ArchiveLoadFailed
|
|
|
|
# Revisit Errors
|
|
# original missing
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz - - -', reraise=True)
|
|
Traceback (most recent call last):
|
|
ArchiveLoadFailed: Missing Revisit Original
|
|
|
|
# original warc not found
|
|
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz 100 10 someunknown.warc.gz', reraise=True)
|
|
Traceback (most recent call last):
|
|
ArchiveLoadFailed: Missing Revisit Original
|
|
|
|
# no revisit func available
|
|
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX, revisit_func=None, reraise=True)
|
|
Traceback (most recent call last):
|
|
ArchiveLoadFailed: Original for revisit could not be loaded
|
|
|
|
# url-agnostic original found, but could not be loaded
|
|
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX, revisit_func=load_orig_bad_cdx, reraise=True)
|
|
Traceback (most recent call last):
|
|
ArchiveLoadFailed: Original for revisit could not be loaded
|
|
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import pprint
|
|
|
|
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
|
from pywb.warc.pathresolvers import make_best_resolvers
|
|
from pywb.warc.resolvingloader import ResolvingLoader
|
|
from pywb.cdx.cdxobject import CDXObject
|
|
|
|
from pywb import get_test_dir
|
|
|
|
#==============================================================================
|
|
test_warc_dir = get_test_dir() + 'warcs/'
|
|
|
|
|
|
URL_AGNOSTIC_ORIG_CDX = 'org,iana,example)/ 20130702195402 http://example.iana.org/ \
|
|
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
|
1001 353 example-url-agnostic-orig.warc.gz'
|
|
|
|
URL_AGNOSTIC_REVISIT_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \
|
|
warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
|
591 355 example-url-agnostic-revisit.warc.gz'
|
|
|
|
URL_AGNOSTIC_REVISIT_NO_DIGEST_CDX = 'com,example)/ 20130729195151 http://test@example.com/ \
|
|
warc/revisit - - - - \
|
|
591 355 example-url-agnostic-revisit.warc.gz'
|
|
|
|
BAD_ORIG_CDX = 'org,iana,example)/ 20130702195401 http://example.iana.org/ \
|
|
text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
|
1001 353 someunknown.warc.gz'
|
|
|
|
|
|
#==============================================================================
|
|
def load_test_archive(test_file, offset, length):
|
|
path = test_warc_dir + test_file
|
|
|
|
testloader = ArcWarcRecordLoader()
|
|
|
|
archive = testloader.load(path, offset, length)
|
|
|
|
pprint.pprint(((archive.format, archive.rec_type),
|
|
archive.rec_headers, archive.status_headers))
|
|
|
|
|
|
#==============================================================================
|
|
def load_orig_bad_cdx(self):
|
|
return [CDXObject(BAD_ORIG_CDX),
|
|
CDXObject(BAD_ORIG_CDX)]
|
|
|
|
|
|
#==============================================================================
|
|
def load_orig_cdx(self):
|
|
return [CDXObject(BAD_ORIG_CDX),
|
|
CDXObject(URL_AGNOSTIC_ORIG_CDX)]
|
|
|
|
|
|
#==============================================================================
|
|
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
|
|
failed_files=None):
|
|
resolve_loader = ResolvingLoader(test_warc_dir)
|
|
cdx = CDXObject(cdx)
|
|
|
|
try:
|
|
(headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
|
|
print headers
|
|
sys.stdout.write(stream.readline())
|
|
sys.stdout.write(stream.readline())
|
|
except ArchiveLoadFailed as e:
|
|
if reraise:
|
|
raise
|
|
else:
|
|
print 'Exception: ' + e.__class__.__name__
|
|
|
|
|
|
#==============================================================================
|
|
def parse_stream_error(**params):
|
|
try:
|
|
return ArcWarcRecordLoader().parse_record_stream(**params)
|
|
except Exception as e:
|
|
print 'Exception: ' + e.__class__.__name__
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import doctest
|
|
doctest.testmod()
|