From 3f084625b07f40525e04a8e82af7d40c6a8ec151 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Tue, 17 Mar 2015 21:11:35 -0700
Subject: [PATCH] indexing: cdx json support (#76): use OrderedDict when
 indexing json to ensure consistent ordering; skip empty or '-' fields; add
 tests for cdx json

---
 pywb/manager/manager.py         |  2 +-
 pywb/warc/archiveiterator.py    | 31 ++++++++++++++++++++++++-------
 pywb/warc/cdxindexer.py         | 18 ++++++++++++++----
 pywb/warc/test/test_indexing.py | 18 ++++++++++++++++++
 pywb/webapp/pywb_init.py        |  2 +-
 5 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py
index 595a71f9..9bd57a6b 100644
--- a/pywb/manager/manager.py
+++ b/pywb/manager/manager.py
@@ -23,7 +23,7 @@ def get_input(msg):  #pragma: no cover
 
 
 #=============================================================================
-class CollectionsManager(object):  #pragma: no cover
+class CollectionsManager(object):
     """ This utility is designed to
 simplify the creation and management of web archive collections
 
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
index dfdc1835..7a88ec44 100644
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@@ -10,6 +10,11 @@ import base64
 
 import re
 
+try:  # pragma: no cover
+    from collections import OrderedDict
+except ImportError:  # pragma: no cover
+    from ordereddict import OrderedDict
+
 
 #=================================================================
 class ArchiveIterator(object):
@@ -189,7 +194,7 @@ class ArchiveIterator(object):
 
 
 #=================================================================
-class ArchiveIndexEntry(dict):
+class ArchiveIndexEntryMixin(object):
     MIME_RE = re.compile('[; ]')
 
     def extract_mime(self, mime, def_mime='unk'):
@@ -210,8 +215,8 @@ class ArchiveIndexEntry(dict):
             self['status'] = '-'
 
     def set_rec_info(self, offset, length, digest):
-        self['offset'] = str(offset)
         self['length'] = str(length)
+        self['offset'] = str(offset)
         if digest:
             self['digest'] = digest
 
@@ -244,6 +249,12 @@ class DefaultRecordIter(object):
     def __init__(self, **options):
         self.options = options
 
+    def _create_index_entry(self):
+        if self.options.get('cdxj'):
+            return OrderedArchiveIndexEntry()
+        else:
+            return ArchiveIndexEntry()
+
     def create_record_iter(self, arcv_iter):
         append_post = self.options.get('append_post')
         include_all = self.options.get('include_all')
@@ -344,7 +355,7 @@ class DefaultRecordIter(object):
         """ Parse warc record
         """
 
-        entry = ArchiveIndexEntry()
+        entry = self._create_index_entry()
 
         if record.rec_type == 'warcinfo':
             entry['url'] = record.rec_headers.get_header('WARC-Filename')
@@ -403,7 +414,7 @@ class DefaultRecordIter(object):
         # replace nulls
         url = url.replace('\x00', '%00')
 
-        entry = ArchiveIndexEntry()
+        entry = self._create_index_entry()
         entry['url'] = url
 
         # timestamp
@@ -414,12 +425,12 @@ class DefaultRecordIter(object):
         if self.options.get('minimal'):
             return entry
 
-        # status
-        entry.extract_status(record.status_headers)
-
         # mime
         entry.extract_mime(record.rec_headers.get_header('content-type'))
 
+        # status
+        entry.extract_status(record.status_headers)
+
         # digest
         entry['digest'] = '-'
 
@@ -439,3 +450,9 @@ class DefaultRecordIter(object):
                 continue
 
             yield entry
+
+class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict):
+    pass
+
+class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict):
+    pass
diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py
index c25c928c..7bbe4942 100644
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@@ -1,6 +1,13 @@
 import os
 import sys
-import json
+
+from json import dumps as json_encode
+
+try:  # pragma: no cover
+    from collections import OrderedDict
+except ImportError:  # pragma: no cover
+    from ordereddict import OrderedDict
+
 
 from argparse import ArgumentParser, RawTextHelpFormatter
 from bisect import insort
@@ -42,8 +49,7 @@ class CDXJ(object):
         out.write(entry['timestamp'])
         out.write(' ')
 
-        outdict = {}
-        outdict['filename'] = filename
+        outdict = OrderedDict()
 
         for n, v in entry.iteritems():
             if n in ('key', 'timestamp'):
@@ -52,9 +58,13 @@ class CDXJ(object):
             if n.startswith('_'):
                 continue
 
+            if not v or v == '-':
+                continue
+
             outdict[n] = v
 
-        out.write(json.dumps(outdict))
+        outdict['filename'] = filename
+        out.write(json_encode(outdict))
         out.write('\n')
 
 
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
index c01d3ed0..6d1d33f7 100644
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@@ -15,6 +15,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 exa
 com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
 
+# warc.gz -- minimal CDXJ
+>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
+org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
+
 # warc.gz -- parse all
 >>> print_cdx_index('example.warc.gz', include_all=True)
  CDX N b a m s k r M S V g
@@ -24,6 +30,14 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 
+# warc.gz -- parse all -- CDXJ
+>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True)
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
+org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
+
 # warc
 >>> print_cdx_index('example.warc')
  CDX N b a m s k r M S V g
@@ -45,6 +59,10 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
  CDX N b a m s k r M S V g
 com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
 
+# arc.gz -- json
+>>> print_cdx_index('example.arc.gz', cdxj=True)
+com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+
 # arc
 >>> print_cdx_index('example.arc')
  CDX N b a m s k r M S V g
diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py
index cd18cc25..cad3d1a6 100644
--- a/pywb/webapp/pywb_init.py
+++ b/pywb/webapp/pywb_init.py
@@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
 
 
 #=================================================================
-class DirectoryCollsLoader(object):  #pragma: no cover
+class DirectoryCollsLoader(object):
     def __init__(self, config, static_routes):
         self.config = config
         self.static_routes = static_routes