From 61ce53a0e0f9e9cc99ea4c07dba9704cf77f5779 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 28 Oct 2014 10:29:50 -0700 Subject: [PATCH] warc/cdx: include metadata and resource records in default cdx index. Emit 200 and 204 responses for metadata and resource records, though write '-' to cdx (for compatibility for now). Include content-length in resource/metadata records. --- pywb/warc/archiveiterator.py | 10 ++++++---- pywb/warc/recordloader.py | 17 ++++++++++++----- pywb/warc/test/test_indexing.py | 26 +++++++++++++++----------- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 6e9488e9..659bd2e1 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -168,6 +168,8 @@ class ArchiveIndexEntry(object): self.status = status_headers.get_statuscode() if not self.status: self.status = '-' + if self.status == '204' and 'Error' in status_headers.statusline: + self.status = '-' def set_rec_info(self, offset, length, digest): self.offset = str(offset) @@ -314,11 +316,11 @@ def parse_warc_record(record): get_header('Content-Type'), def_mime) - # status - if record.rec_type in ('request', 'revisit'): - entry.status = '-' - else: + # status -- only for response records (by convention): + if record.rec_type == 'response': entry.extract_status(record.status_headers) + else: + entry.status = '-' # digest entry.digest = record.rec_headers.get_header('WARC-Payload-Digest') diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 12b93b9c..67cc9e22 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException): #================================================================= class ArcWarcRecordLoader: # Standard ARC v1.0 headers - # TODO: support ARV v2.0 also? + # TODO: support ARC v2.0 also? 
ARC_HEADERS = ["uri", "ip-address", "archive-date", "content-type", "length"] @@ -128,9 +128,14 @@ class ArcWarcRecordLoader: # limit stream to the length for all valid records stream = LimitReader.wrap_stream(stream, length) - # if empty record (error or otherwise) set status to - + # if empty record (error or otherwise) set status to 204 if length == 0: - status_headers = StatusAndHeaders('- None', []) + if is_err: + msg = '204 Possible Error' + else: + msg = '204 No Content' + + status_headers = StatusAndHeaders(msg, []) # response record or non-empty revisit: parse HTTP status and headers! elif (rec_type in ('response', 'revisit') and @@ -144,8 +149,10 @@ class ArcWarcRecordLoader: # everything else: create a no-status entry, set content-type else: - content_type_header = [('Content-Type', content_type)] - status_headers = StatusAndHeaders('- OK', content_type_header) + content_type_header = [('Content-Type', content_type), + ('Content-Length', length)] + + status_headers = StatusAndHeaders('200 OK', content_type_header) return ArcWarcRecord(the_format, rec_type, rec_headers, stream, status_headers, diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index b90e9d65..cb8dc4bb 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc -# wget warc, just responses +# wget warc, includes metadata by default >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - 
SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz -# wget warc include all w/ metadata + +# wget warc, includes metadata and request >>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True) CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz @@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -200 +Total: 206 # test sort, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz -398 +Total: 398 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 
2907 example.warc.gz -4 +Total: 4 # test writing to stdout ('-' omitted) >>> cli_lines([TEST_WARC_DIR + 'example.warc.gz']) com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -4 +Total: 4 # test writing to temp dir, also use unicode filename >>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz')) example.cdx com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz -4 +Total: 4 """ from pywb import get_test_dir @@ -191,9 +195,9 @@ def cli_lines(cmds): lines = buff.getvalue().rstrip().split('\n') # print first, last, num lines - print (lines[1]) - print (lines[-1]) - print len(lines) + print(lines[1]) + print(lines[-1]) + print('Total: ' + str(len(lines))) def cli_lines_with_dir(input_): try: @@ -224,6 +228,6 @@ def cli_lines_with_dir(input_): # print first, last, num lines print (lines[1]) print (lines[-1]) - print len(lines) + print('Total: ' + str(len(lines)))