mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warc/cdx: include metadata and resource records in default cdx index
Emit 200 and 204 responses for metadata and resource records, though still write '-' as the status in the cdx (for compatibility for now). Include Content-Length in resource/metadata records.
This commit is contained in:
parent
c9273ee5ed
commit
61ce53a0e0
@ -168,6 +168,8 @@ class ArchiveIndexEntry(object):
|
|||||||
self.status = status_headers.get_statuscode()
|
self.status = status_headers.get_statuscode()
|
||||||
if not self.status:
|
if not self.status:
|
||||||
self.status = '-'
|
self.status = '-'
|
||||||
|
if self.status == '204' and 'Error' in status_headers.statusline:
|
||||||
|
self.status = '-'
|
||||||
|
|
||||||
def set_rec_info(self, offset, length, digest):
|
def set_rec_info(self, offset, length, digest):
|
||||||
self.offset = str(offset)
|
self.offset = str(offset)
|
||||||
@ -314,11 +316,11 @@ def parse_warc_record(record):
|
|||||||
get_header('Content-Type'),
|
get_header('Content-Type'),
|
||||||
def_mime)
|
def_mime)
|
||||||
|
|
||||||
# status
|
# status -- only for response records (by convention):
|
||||||
if record.rec_type in ('request', 'revisit'):
|
if record.rec_type == 'response':
|
||||||
entry.status = '-'
|
|
||||||
else:
|
|
||||||
entry.extract_status(record.status_headers)
|
entry.extract_status(record.status_headers)
|
||||||
|
else:
|
||||||
|
entry.status = '-'
|
||||||
|
|
||||||
# digest
|
# digest
|
||||||
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||||
|
@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class ArcWarcRecordLoader:
|
class ArcWarcRecordLoader:
|
||||||
# Standard ARC v1.0 headers
|
# Standard ARC v1.0 headers
|
||||||
# TODO: support ARV v2.0 also?
|
# TODO: support ARC v2.0 also?
|
||||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||||
"content-type", "length"]
|
"content-type", "length"]
|
||||||
|
|
||||||
@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
|
|||||||
# limit stream to the length for all valid records
|
# limit stream to the length for all valid records
|
||||||
stream = LimitReader.wrap_stream(stream, length)
|
stream = LimitReader.wrap_stream(stream, length)
|
||||||
|
|
||||||
# if empty record (error or otherwise) set status to -
|
# if empty record (error or otherwise) set status to 204
|
||||||
if length == 0:
|
if length == 0:
|
||||||
status_headers = StatusAndHeaders('- None', [])
|
if is_err:
|
||||||
|
msg = '204 Possible Error'
|
||||||
|
else:
|
||||||
|
msg = '204 No Content'
|
||||||
|
|
||||||
|
status_headers = StatusAndHeaders(msg, [])
|
||||||
|
|
||||||
# response record or non-empty revisit: parse HTTP status and headers!
|
# response record or non-empty revisit: parse HTTP status and headers!
|
||||||
elif (rec_type in ('response', 'revisit') and
|
elif (rec_type in ('response', 'revisit') and
|
||||||
@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
|
|||||||
|
|
||||||
# everything else: create a no-status entry, set content-type
|
# everything else: create a no-status entry, set content-type
|
||||||
else:
|
else:
|
||||||
content_type_header = [('Content-Type', content_type)]
|
content_type_header = [('Content-Type', content_type),
|
||||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
('Content-Length', length)]
|
||||||
|
|
||||||
|
status_headers = StatusAndHeaders('200 OK', content_type_header)
|
||||||
|
|
||||||
return ArcWarcRecord(the_format, rec_type,
|
return ArcWarcRecord(the_format, rec_type,
|
||||||
rec_headers, stream, status_headers,
|
rec_headers, stream, status_headers,
|
||||||
|
@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
|||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||||
|
|
||||||
# wget warc, just responses
|
# wget warc, includes metadata by default
|
||||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||||
|
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||||
|
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||||
|
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||||
|
|
||||||
# wget warc include all w/ metadata
|
|
||||||
|
# wget warc, includes metadata and request
|
||||||
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
||||||
@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
|||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||||
200
|
Total: 206
|
||||||
|
|
||||||
# test sort, multiple inputs, all records + post query
|
# test sort, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||||
398
|
Total: 398
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
4
|
Total: 4
|
||||||
|
|
||||||
# test writing to stdout ('-' omitted)
|
# test writing to stdout ('-' omitted)
|
||||||
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
|
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
4
|
Total: 4
|
||||||
|
|
||||||
# test writing to temp dir, also use unicode filename
|
# test writing to temp dir, also use unicode filename
|
||||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||||
example.cdx
|
example.cdx
|
||||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
4
|
Total: 4
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
@ -191,9 +195,9 @@ def cli_lines(cmds):
|
|||||||
lines = buff.getvalue().rstrip().split('\n')
|
lines = buff.getvalue().rstrip().split('\n')
|
||||||
|
|
||||||
# print first, last, num lines
|
# print first, last, num lines
|
||||||
print (lines[1])
|
print(lines[1])
|
||||||
print (lines[-1])
|
print(lines[-1])
|
||||||
print len(lines)
|
print('Total: ' + str(len(lines)))
|
||||||
|
|
||||||
def cli_lines_with_dir(input_):
|
def cli_lines_with_dir(input_):
|
||||||
try:
|
try:
|
||||||
@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
|
|||||||
# print first, last, num lines
|
# print first, last, num lines
|
||||||
print (lines[1])
|
print (lines[1])
|
||||||
print (lines[-1])
|
print (lines[-1])
|
||||||
print len(lines)
|
print('Total: ' + str(len(lines)))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user