From fe1c32c8f7d78cb38804abb85aa62f84aab88ccd Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Thu, 19 Mar 2015 11:20:40 -0700
Subject: [PATCH] cdxj: support loading cdxj (#76); cdx obj: allow alt field
 names to be used (e.g. mime/mimetype/m, status/statuscode/s) in querying
 and reading cdx; cdx minimal: (#75) now implies cdxj to avoid more formats,
 minimal always includes digest, and mime when warc/revisit; tests for cdxj
 loading; indexing optimization: reuse same entry obj for records of same type

---
 config.yaml                     |  4 +-
 pywb/cdx/cdxobject.py           | 88 +++++++++++++++++++++++++++------
 pywb/cdx/cdxops.py              | 11 +++--
 pywb/cdx/cdxserver.py           |  2 +-
 pywb/cdx/cdxsource.py           |  5 +-
 pywb/cdx/test/test_cdxops.py    | 20 +++++++-
 pywb/warc/archiveiterator.py    | 54 +++++++++++---------
 pywb/warc/cdxindexer.py         | 56 ++++++++-------------
 pywb/warc/test/test_indexing.py | 17 +++----
 setup.py                        |  2 +
 tests/test_config.yaml          |  2 +
 tests/test_integration.py       |  8 +++
 12 files changed, 174 insertions(+), 95 deletions(-)
diff --git a/config.yaml b/config.yaml
index de1610bd..36ce435f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -107,8 +107,8 @@ enable_http_proxy: true
 # List of route names:
 # <route>: <package or file path>
 # default route static/default for pywb defaults
-static_routes:
-          static/default: pywb/static/
+#static_routes:
+#          static/default: pywb/static/
 
 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: true
diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py
index 89a40be0..60a0eb8b 100644
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -10,6 +10,26 @@ from urlparse import parse_qs
 
 from pywb.utils.wbexception import WbException
 
+from json import loads as json_decode
+
+
+#=================================================================
+URLKEY = 'urlkey'
+TIMESTAMP = 'timestamp'
+ORIGINAL = 'original'
+MIMETYPE = 'mimetype'
+STATUSCODE = 'statuscode'
+DIGEST = 'digest'
+REDIRECT = 'redirect'
+ROBOTFLAGS = 'robotflags'
+LENGTH = 'length'
+OFFSET = 'offset'
+FILENAME = 'filename'
+
+ORIG_LENGTH = 'orig.length'
+ORIG_OFFSET = 'orig.offset'
+ORIG_FILENAME = 'orig.filename'
+
 
 #=================================================================
 class CDXException(WbException):
@@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
     """
     CDX_FORMATS = [
         # Public CDX Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "length"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, LENGTH],
 
         # CDX 11 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "robotflags", "length", "offset", "filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],
 
         # CDX 9 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "offset", "filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, OFFSET, FILENAME],
 
         # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "robotflags", "length", "offset", "filename",
-         "orig.length", "orig.offset", "orig.filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
 
         # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "offset", "filename",
-         "orig.length", "orig.offset", "orig.filename"]
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, OFFSET, FILENAME,
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
     ]
 
+    # Alternate field names (cdxj json keys, filter fields) -> canonical names
+    CDX_ALT_FIELDS = {
+                  'u': ORIGINAL,
+                  'url': ORIGINAL,
+
+                  'status': STATUSCODE,
+                  's': STATUSCODE,
+
+                  'mime': MIMETYPE,
+                  'm': MIMETYPE,
+
+                  'l': LENGTH,
+                  # no 's' alias for length: it would override statuscode
+
+                  'o': OFFSET,
+
+                  'd': DIGEST,
+
+                  't': TIMESTAMP,
+
+                  'k': URLKEY,
+
+                  'f': FILENAME
+    }
+
     def __init__(self, cdxline=''):
         OrderedDict.__init__(self)
 
@@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
             self.cdxline = cdxline
             return
 
-        fields = cdxline.split(' ')
+        fields = cdxline.split(' ' , 2)
+        # Check for CDX JSON
+        if fields[-1].startswith('{'):
+            self[URLKEY] = fields[0]
+            self[TIMESTAMP] = fields[1]
+            json_fields = json_decode(fields[-1])
+            for n, v in json_fields.iteritems():
+                n = self.CDX_ALT_FIELDS.get(n, n)
+                self[n] = str(v)
+            self.cdxline = cdxline
+            return
+
+        more_fields = fields.pop().split(' ')
+        fields.extend(more_fields)
 
         cdxformat = None
         for i in self.CDX_FORMATS:
@@ -80,8 +138,8 @@ class CDXObject(OrderedDict):
 
     def is_revisit(self):
         """return ``True`` if this record is a revisit record."""
-        return (self['mimetype'] == 'warc/revisit' or
-                self['filename'] == '-')
+        return (self.get(MIMETYPE) == 'warc/revisit' or
+                self.get(FILENAME) == '-')
 
     def to_text(self, fields=None):
         """
diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py
index 4aa4fc17..0bf58c42 100644
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
             # apply filter to cdx[field]
             else:
                 self.field = parts[0]
+                self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
+                                                          self.field)
                 string = parts[1]
 
             # make regex if regex mode
@@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
                 self.filter_str = string
 
         def __call__(self, cdx):
-            val = cdx[self.field] if self.field else str(cdx)
+            if not self.field:
+                val = str(cdx)
+            else:
+                val = cdx.get(self.field, '')
 
             matched = self.compare_func(val)
 
@@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
         if original_cdx and is_revisit:
             fill_orig = lambda field: original_cdx[field]
             # Transfer mimetype and statuscode
-            cdx['mimetype'] = original_cdx['mimetype']
-            cdx['statuscode'] = original_cdx['statuscode']
+            cdx['mimetype'] = original_cdx.get('mimetype', 'none')
+            cdx['statuscode'] = original_cdx.get('statuscode', 'none')
         else:
             fill_orig = lambda field: '-'
 
diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py
index 1ecec407..0de3e325 100644
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
         if filename.startswith('redis://'):
             return RedisCDXSource(filename, config)
 
-        if filename.endswith('.cdx'):
+        if filename.endswith(('.cdx', '.cdxj')):
             return CDXFile(filename)
 
         if filename.endswith(('.summary', '.idx')):
diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py
index e3174ab1..7eabdbed 100644
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@@ -29,13 +29,10 @@ class CDXFile(CDXSource):
 
     def load_cdx(self, query):
         def do_open():
-            try:
-                source = open(self.filename, 'rb')
+            with open(self.filename, 'rb') as source:
                 gen = iter_range(source, query.key, query.end_key)
                 for line in gen:
                     yield line
-            finally:
-                source.close()
 
         return do_open()
         #return iter_range(do_open(), query.key, query.end_key)
diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py
index 86c2fce8..88ea0b58 100644
--- a/pywb/cdx/test/test_cdxops.py
+++ b/pywb/cdx/test/test_cdxops.py
@@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
 
+# Filter Alt field name
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
+org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
+
+# Filter -- no field specified, match regex on entire line
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
+org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
+
+# Filter -- no such field, no matches
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
+Traceback (most recent call last):
+NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
+
 # Filter exact
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
 org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
 
-
 # Sort by closest timestamp + field select output
 >>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
 20140126200826
@@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
 >>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
 org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
+
+# Resolve Revisit -- cdxj minimal
+#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
+
+
+
 """
 
 #=================================================================
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index 7a88ec44..c72eae62 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -197,6 +197,9 @@ class ArchiveIterator(object):
 class ArchiveIndexEntryMixin(object):
     MIME_RE = re.compile('[; ]')
 
+    def reset_entry(self):
+        self['key'] = ''
+
     def extract_mime(self, mime, def_mime='unk'):
         """ Utility function to extract mimetype only
         from a full content type, removing charset settings
@@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
             self['status'] = '-'
 
     def set_rec_info(self, offset, length, digest):
-        self['length'] = str(length)
-        self['offset'] = str(offset)
         if digest:
             self['digest'] = digest
 
+        self['length'] = str(length)
+        self['offset'] = str(offset)
+
     def merge_request_data(self, other, options):
         surt_ordered = options.get('surt_ordered', True)
 
@@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
 class DefaultRecordIter(object):
     def __init__(self, **options):
         self.options = options
+        self.entry_cache = {}
 
-    def _create_index_entry(self):
-        if self.options.get('cdxj'):
-            return OrderedArchiveIndexEntry()
-        else:
-            return ArchiveIndexEntry()
+    def _create_index_entry(self, rec_type):
+        # Reuse a single entry object per record type instead of allocating
+        # a new one for every record; a cached entry is reset before reuse.
+        try:
+            entry = self.entry_cache[rec_type]
+            entry.reset_entry()
+        except KeyError:
+            if self.options.get('cdxj'):
+                entry = OrderedArchiveIndexEntry()
+            else:
+                entry = ArchiveIndexEntry()
+            self.entry_cache[rec_type] = entry
+        return entry
 
     def create_record_iter(self, arcv_iter):
         append_post = self.options.get('append_post')
@@ -295,8 +308,7 @@ class DefaultRecordIter(object):
 
             compute_digest = False
 
-            if (not minimal and
-                entry.get('digest', '-') == '-' and
+            if (entry.get('digest', '-') == '-' and
                 record.rec_type not in ('revisit', 'request', 'warcinfo')):
 
                 compute_digest = True
@@ -312,7 +324,6 @@ class DefaultRecordIter(object):
 
                 entry['_post_query'] = post_query
 
-            #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
             arcv_iter.read_to_end(record, compute_digest)
             entry.set_rec_info(*arcv_iter.member_info)
             entry.record = record
@@ -355,7 +366,7 @@ class DefaultRecordIter(object):
         """ Parse warc record
         """
 
-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)
 
         if record.rec_type == 'warcinfo':
             entry['url'] = record.rec_headers.get_header('WARC-Filename')
@@ -369,12 +380,11 @@ class DefaultRecordIter(object):
         entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                    get_header('WARC-Date'))
 
-        if self.options.get('minimal'):
-            return entry
-
         # mime
         if record.rec_type == 'revisit':
             entry['mime'] = 'warc/revisit'
+        elif self.options.get('minimal'):
+            entry['mime'] = '-'
         else:
             def_mime = '-' if record.rec_type == 'request' else 'unk'
             entry.extract_mime(record.status_headers.
@@ -382,7 +392,7 @@ class DefaultRecordIter(object):
                                def_mime)
 
         # status -- only for response records (by convention):
-        if record.rec_type == 'response':
+        if record.rec_type == 'response' and not self.options.get('minimal'):
             entry.extract_status(record.status_headers)
         else:
             entry['status'] = '-'
@@ -414,7 +424,7 @@ class DefaultRecordIter(object):
         # replace nulls
         url = url.replace('\x00', '%00')
 
-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)
         entry['url'] = url
 
         # timestamp
@@ -422,14 +432,12 @@ class DefaultRecordIter(object):
         if len(entry['timestamp']) > 14:
             entry['timestamp'] = entry['timestamp'][:14]
 
-        if self.options.get('minimal'):
-            return entry
+        if not self.options.get('minimal'):
+            # mime
+            entry.extract_mime(record.rec_headers.get_header('content-type'))
 
-        # mime
-        entry.extract_mime(record.rec_headers.get_header('content-type'))
-
-        # status
-        entry.extract_status(record.status_headers)
+            # status
+            entry.extract_status(record.status_headers)
 
         # digest
         entry['digest'] = '-'
diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py
index 7bbe4942..4d7c5837 100644
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@@ -16,6 +16,7 @@ from io import BytesIO
 
 from archiveiterator import DefaultRecordIter
 
+
 #=================================================================
 class BaseCDXWriter(object):
     def __init__(self, out):
@@ -68,26 +69,6 @@ class CDXJ(object):
         out.write('\n')
 
 
-#=================================================================
-class CDX06(object):
-    def _write_header(self):
-        self.out.write(' CDX N b a S V g\n')
-
-    def write_cdx_line(self, out, entry, filename):
-        out.write(entry['key'])
-        out.write(' ')
-        out.write(entry['timestamp'])
-        out.write(' ')
-        out.write(entry['url'])
-        out.write(' ')
-        out.write(entry['length'])
-        out.write(' ')
-        out.write(entry['offset'])
-        out.write(' ')
-        out.write(filename)
-        out.write('\n')
-
-
 #=================================================================
 class CDX09(object):
     def _write_header(self):
@@ -201,6 +182,8 @@ def cdx_filename(filename):
 #=================================================================
 def get_cdx_writer_cls(options):
     writer_cls = options.get('writer_cls')
+    if options.get('minimal'):
+        options['cdxj'] = True
 
     if writer_cls:
         if not options.get('writer_add_mixin'):
@@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):
 
     if options.get('cdxj'):
         format_mixin = CDXJ
-    elif options.get('cdx06') or options.get('minimal'):
-        format_mixin = CDX06
     elif options.get('cdx09'):
         format_mixin = CDX09
     else:
@@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
     cdx09_help = """
 Use older 9-field cdx format, default is 11-cdx field
 """
-    minimal_help = """
-Use a minimal 6-field cdx format, outputing only the basic fields
-needed to identiyfy record:
-canonicalized url, timestamp, original url, archive offset, archive length
-and archive filename.
+    minimal_json_help = """
+CDX JSON output, but with minimal fields only, available  w/o parsing
+http record. The fields are:
+canonicalized url, timestamp, original url, digest, archive offset, archive length
+and archive filename. mimetype is included to indicate warc/revisit only.
 
 This option skips record parsing and will not work with
 POST append (-p) option
+"""
+
+    json_help = """
+Output CDX JSON format per line, with url timestamp first, followed by json dict
+for all other fields:
+url timestamp { ... }
 """
 
     output_help = """output file or directory.
@@ -370,15 +357,13 @@ if input is a directory"""
                         action='store_true',
                         help=cdx09_help)
 
-    group.add_argument('-6', '--cdx06',
-                        action='store_true')
-
     group.add_argument('-j', '--cdxj',
-                        action='store_true')
-
-    parser.add_argument('-m', '--minimal',
                         action='store_true',
-                        help=minimal_help)
+                        help=json_help)
+
+    parser.add_argument('-mj', '--minimal-cdxj',
+                        action='store_true',
+                        help=minimal_json_help)
 
     parser.add_argument('output', nargs='?', default='-', help=output_help)
     parser.add_argument('inputs', nargs='+', help=input_help)
@@ -392,9 +377,8 @@ if input is a directory"""
                           append_post=cmd.postappend,
                           recurse=cmd.recurse,
                           cdx09=cmd.cdx09,
-                          cdx06=cmd.cdx06,
                           cdxj=cmd.cdxj,
-                          minimal=cmd.minimal)
+                          minimal=cmd.minimal_cdxj)
 
 
 if __name__ == '__main__':
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index 6d1d33f7..25498760 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 
-# warc.gz -- minimal cdx
->>> print_cdx_index('example.warc.gz', minimal=True)
- CDX N b a S V g
-com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
-com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
-org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
-
 # warc.gz -- minimal CDXJ
 >>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
-com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
-com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
-org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
+org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
 
 # warc.gz -- parse all
 >>> print_cdx_index('example.warc.gz', include_all=True)
@@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
 >>> print_cdx_index('example.arc.gz', cdxj=True)
 com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 
+# arc.gz -- minimal + json
+>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
+com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+
 # arc
 >>> print_cdx_index('example.arc')
  CDX N b a m s k r M S V g
diff --git a/setup.py b/setup.py
index b0e89c60..4eca13cd 100755
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,8 @@ setup(
         },
     data_files=[
         ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
+        ('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
+        ('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
         ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
         ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
         ('sample_archive/text_content',
diff --git a/tests/test_config.yaml b/tests/test_config.yaml
index fde10382..d7533951 100644
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@@ -46,6 +46,8 @@ collections:
         index_paths: ./sample_archive/cdx/
         redir_to_exact: false
 
+    pywb-cdxj:
+        index_paths: ./sample_archive/cdxj/
 
 
 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 1b04b0de..0612004b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -124,6 +124,14 @@ class TestWb:
         assert 'wb.js' in resp.body
         assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body
 
+    def test_replay_cdxj(self):
+        resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1')
+        self._assert_basic_html(resp)
+
+        assert '"20140103030321"' in resp.body
+        assert 'wb.js' in resp.body
+        assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body
+
     def test_zero_len_revisit(self):
         resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
         self._assert_basic_html(resp)