mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warc/cdx: include metadata and resource records in default cdx index
Emit 200 and 204 responses for metadata and resource records, though still write '-' as the status in the cdx (for compatibility, for now). Include Content-Length in resource/metadata records.
This commit is contained in:
parent
c9273ee5ed
commit
61ce53a0e0
@ -168,6 +168,8 @@ class ArchiveIndexEntry(object):
|
||||
self.status = status_headers.get_statuscode()
|
||||
if not self.status:
|
||||
self.status = '-'
|
||||
if self.status == '204' and 'Error' in status_headers.statusline:
|
||||
self.status = '-'
|
||||
|
||||
def set_rec_info(self, offset, length, digest):
|
||||
self.offset = str(offset)
|
||||
@ -314,11 +316,11 @@ def parse_warc_record(record):
|
||||
get_header('Content-Type'),
|
||||
def_mime)
|
||||
|
||||
# status
|
||||
if record.rec_type in ('request', 'revisit'):
|
||||
entry.status = '-'
|
||||
else:
|
||||
# status -- only for response records (by convention):
|
||||
if record.rec_type == 'response':
|
||||
entry.extract_status(record.status_headers)
|
||||
else:
|
||||
entry.status = '-'
|
||||
|
||||
# digest
|
||||
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
|
@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
|
||||
#=================================================================
|
||||
class ArcWarcRecordLoader:
|
||||
# Standard ARC v1.0 headers
|
||||
# TODO: support ARV v2.0 also?
|
||||
# TODO: support ARC v2.0 also?
|
||||
ARC_HEADERS = ["uri", "ip-address", "archive-date",
|
||||
"content-type", "length"]
|
||||
|
||||
@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
|
||||
# limit stream to the length for all valid records
|
||||
stream = LimitReader.wrap_stream(stream, length)
|
||||
|
||||
# if empty record (error or otherwise) set status to -
|
||||
# if empty record (error or otherwise) set status to 204
|
||||
if length == 0:
|
||||
status_headers = StatusAndHeaders('- None', [])
|
||||
if is_err:
|
||||
msg = '204 Possible Error'
|
||||
else:
|
||||
msg = '204 No Content'
|
||||
|
||||
status_headers = StatusAndHeaders(msg, [])
|
||||
|
||||
# response record or non-empty revisit: parse HTTP status and headers!
|
||||
elif (rec_type in ('response', 'revisit') and
|
||||
@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
|
||||
|
||||
# everything else: create a no-status entry, set content-type
|
||||
else:
|
||||
content_type_header = [('Content-Type', content_type)]
|
||||
status_headers = StatusAndHeaders('- OK', content_type_header)
|
||||
content_type_header = [('Content-Type', content_type),
|
||||
('Content-Length', length)]
|
||||
|
||||
status_headers = StatusAndHeaders('200 OK', content_type_header)
|
||||
|
||||
return ArcWarcRecord(the_format, rec_type,
|
||||
rec_headers, stream, status_headers,
|
||||
|
@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
|
||||
# wget warc, just responses
|
||||
# wget warc, includes metadata by default
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
|
||||
# wget warc include all w/ metadata
|
||||
|
||||
# wget warc, includes metadata and request
|
||||
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
|
||||
@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
200
|
||||
Total: 206
|
||||
|
||||
# test sort, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||
398
|
||||
Total: 398
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
|
||||
# test writing to stdout ('-' omitted)
|
||||
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
|
||||
# test writing to temp dir, also use unicode filename
|
||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||
example.cdx
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
4
|
||||
Total: 4
|
||||
"""
|
||||
|
||||
from pywb import get_test_dir
|
||||
@ -191,9 +195,9 @@ def cli_lines(cmds):
|
||||
lines = buff.getvalue().rstrip().split('\n')
|
||||
|
||||
# print first, last, num lines
|
||||
print (lines[1])
|
||||
print (lines[-1])
|
||||
print len(lines)
|
||||
print(lines[1])
|
||||
print(lines[-1])
|
||||
print('Total: ' + str(len(lines)))
|
||||
|
||||
def cli_lines_with_dir(input_):
|
||||
try:
|
||||
@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
|
||||
# print first, last, num lines
|
||||
print (lines[1])
|
||||
print (lines[-1])
|
||||
print len(lines)
|
||||
print('Total: ' + str(len(lines)))
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user