From 61ce53a0e0f9e9cc99ea4c07dba9704cf77f5779 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Tue, 28 Oct 2014 10:29:50 -0700
Subject: [PATCH] warc/cdx: include metadata and resource records in default
 cdx index; emit 200 and 204 responses for metadata and resource, though write
 '-' to cdx (for compatibility for now); include content-length in
 resource/metadata records

---
 pywb/warc/archiveiterator.py    | 10 ++++++----
 pywb/warc/recordloader.py       | 17 ++++++++++++-----
 pywb/warc/test/test_indexing.py | 26 +++++++++++++++-----------
 3 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index 6e9488e9..659bd2e1 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -168,6 +168,8 @@ class ArchiveIndexEntry(object):
         self.status = status_headers.get_statuscode()
         if not self.status:
             self.status = '-'
+        if self.status == '204' and 'Error' in status_headers.statusline:
+            self.status = '-'
 
     def set_rec_info(self, offset, length, digest):
         self.offset = str(offset)
@@ -314,11 +316,11 @@ def parse_warc_record(record):
                            get_header('Content-Type'),
                            def_mime)
 
-    # status
-    if record.rec_type in ('request', 'revisit'):
-        entry.status = '-'
-    else:
+    # status -- only for response records (by convention):
+    if record.rec_type == 'response':
         entry.extract_status(record.status_headers)
+    else:
+        entry.status = '-'
 
     # digest
     entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py
index 12b93b9c..67cc9e22 100644
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@@ -36,7 +36,7 @@ class ArchiveLoadFailed(WbException):
 #=================================================================
 class ArcWarcRecordLoader:
     # Standard ARC v1.0 headers
-    # TODO: support ARV v2.0 also?
+    # TODO: support ARC v2.0 also?
     ARC_HEADERS = ["uri", "ip-address", "archive-date",
                    "content-type", "length"]
 
@@ -128,9 +128,14 @@ class ArcWarcRecordLoader:
         # limit stream to the length for all valid records
         stream = LimitReader.wrap_stream(stream, length)
 
-        # if empty record (error or otherwise) set status to -
+        # if empty record (error or otherwise) set status to 204
         if length == 0:
-            status_headers = StatusAndHeaders('- None', [])
+            if is_err:
+                msg = '204 Possible Error'
+            else:
+                msg = '204 No Content'
+
+            status_headers = StatusAndHeaders(msg, [])
 
         # response record or non-empty revisit: parse HTTP status and headers!
         elif (rec_type in ('response', 'revisit') and
@@ -144,8 +149,10 @@ class ArcWarcRecordLoader:
 
         # everything else: create a no-status entry, set content-type
         else:
-            content_type_header = [('Content-Type', content_type)]
-            status_headers = StatusAndHeaders('- OK', content_type_header)
+            content_type_header = [('Content-Type', content_type),
+                                   ('Content-Length', length)]
+
+            status_headers = StatusAndHeaders('200 OK', content_type_header)
 
         return ArcWarcRecord(the_format, rec_type,
                              rec_headers, stream, status_headers,
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index b90e9d65..cb8dc4bb 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -43,12 +43,16 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
  CDX N b a m s k r M S V g
 com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
 
-# wget warc, just responses
+# wget warc, includes metadata by default
 >>> print_cdx_index('example-wget-1-14.warc.gz')
  CDX N b a m s k r M S V g
 com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz
+metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz
+metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
+metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
 
-# wget warc include all w/ metadata
+
+# wget warc, includes metadata and request
 >>> print_cdx_index('example-wget-1-14.warc.gz', include_all=True)
  CDX N b a m s k r M S V g
 com,example)/ 20140216012908 http://example.com/ - - - - - 394 398 example-wget-1-14.warc.gz
@@ -110,32 +114,32 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
 >>> cli_lines(['--sort', '-',  TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
 org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
-200
+Total: 206
 
 # test sort, multiple inputs, all records + post query
 >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
 org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
-398
+Total: 398
 
 # test writing to stdout
 >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
-4
+Total: 4
 
 # test writing to stdout ('-' omitted)
 >>> cli_lines([TEST_WARC_DIR + 'example.warc.gz'])
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
-4
+Total: 4
 
 # test writing to temp dir, also use unicode filename
 >>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
 example.cdx
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
-4
+Total: 4
 """
 
 from pywb import get_test_dir
@@ -191,9 +195,9 @@ def cli_lines(cmds):
     lines = buff.getvalue().rstrip().split('\n')
 
     # print first, last, num lines
-    print (lines[1])
-    print (lines[-1])
-    print len(lines)
+    print(lines[1])
+    print(lines[-1])
+    print('Total: ' + str(len(lines)))
 
 def cli_lines_with_dir(input_):
     try:
@@ -224,6 +228,6 @@ def cli_lines_with_dir(input_):
     # print first, last, num lines
     print (lines[1])
     print (lines[-1])
-    print len(lines)
+    print('Total: ' + str(len(lines)))