1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warc/cdx: include metadata and resource records in default cdx index

Emit 200 and 204 statuses for metadata and resource records, but still write '-'
to the cdx status field (for compatibility, for now).
Include Content-Length in resource/metadata records.
This commit is contained in:
Ilya Kreymer 2014-10-28 10:29:50 -07:00
parent c9273ee5ed
commit 61ce53a0e0
3 changed files with 33 additions and 20 deletions

View File

@ -168,6 +168,8 @@ class ArchiveIndexEntry(object):
self.status = status_headers.get_statuscode()
if not self.status:
self.status = '-'
if self.status == '204' and 'Error' in status_headers.statusline:
self.status = '-'
def set_rec_info(self, offset, length, digest):
self.offset = str(offset)
@ -314,11 +316,11 @@ def parse_warc_record(record):
get_header('Content-Type'),
def_mime)
# status
if record.rec_type in ('request', 'revisit'):
entry.status = '-'
else:
# status -- only for response records (by convention):
if record.rec_type == 'response':
entry.extract_status(record.status_headers)
else:
entry.status = '-'
# digest
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')

View File

@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader:
# Standard ARC v1.0 headers
# TODO: support ARV v2.0 also?
# TODO: support ARC v2.0 also?
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
# limit stream to the length for all valid records
stream = LimitReader.wrap_stream(stream, length)
# if empty record (error or otherwise) set status to -
# if empty record (error or otherwise) set status to 204
if length == 0:
status_headers = StatusAndHeaders('- None', [])
if is_err:
msg = '204 Possible Error'
else:
msg = '204 No Content'
status_headers = StatusAndHeaders(msg, [])
# response record or non-empty revisit: parse HTTP status and headers!
elif (rec_type in ('response', 'revisit') and
@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
# everything else: create a no-status entry, set content-type
else:
content_type_header = [('Content-Type', content_type)]
status_headers = StatusAndHeaders('- OK', content_type_header)
content_type_header = [('Content-Type', content_type),
('Content-Length', length)]
status_headers = StatusAndHeaders('200 OK', content_type_header)
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers,

View File

@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
# wget warc, just responses
# wget warc, includes metadata by default
>>> print_cdx_index('example-wget-1-14.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wget warc include all w/ metadata
# wget warc, includes metadata and request
>>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
CDX N b a m s k r M S V g
com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
200
Total: 206
# test sort, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
398
Total: 398
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
# test writing to stdout ('-' omitted)
>>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
example.cdx
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
4
Total: 4
"""
from pywb import get_test_dir
@ -191,9 +195,9 @@ def cli_lines(cmds):
lines = buff.getvalue().rstrip().split('\n')
# print first, last, num lines
print (lines[1])
print (lines[-1])
print len(lines)
print(lines[1])
print(lines[-1])
print('Total: ' + str(len(lines)))
def cli_lines_with_dir(input_):
try:
@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
# print first, last, num lines
print (lines[1])
print (lines[-1])
print len(lines)
print('Total: ' + str(len(lines)))