cdxj: support loading cdxj (#76)

cdx obj: allow alternate field names (e.g. mime/mimetype/m, status/statuscode/s) to be used when querying and reading cdx. cdx minimal (#75): now implies cdxj, to avoid adding more formats; minimal output always includes digest, and includes mime only when it is warc/revisit. Add tests for cdxj loading. Indexing optimization: reuse the same entry obj for records of the same type.
2025-03-15 00:03:28 +01:00 · 2015-03-19 11:20:40 -07:00 · 2015-03-19 11:20:40 -07:00 · fe1c32c8f7
commit fe1c32c8f7
parent 73f24f5a2b
12 changed files with 174 additions and 95 deletions
--- a/config.yaml
+++ b/config.yaml
@ -107,8 +107,8 @@ enable_http_proxy: true
 # List of route names:
 # <route>: <package or file path>
 # default route static/default for pywb defaults
-static_routes:
-          static/default: pywb/static/
+#static_routes:
+#          static/default: pywb/static/

 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: true
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -10,6 +10,26 @@ from urlparse import parse_qs

 from pywb.utils.wbexception import WbException

+from json import loads as json_decode
+
+
+#=================================================================
+URLKEY = 'urlkey'
+TIMESTAMP = 'timestamp'
+ORIGINAL = 'original'
+MIMETYPE = 'mimetype'
+STATUSCODE = 'statuscode'
+DIGEST = 'digest'
+REDIRECT = 'redirect'
+ROBOTFLAGS = 'robotflags'
+LENGTH = 'length'
+OFFSET = 'offset'
+FILENAME = 'filename'
+
+ORIG_LENGTH = 'orig.length'
+ORIG_OFFSET = 'orig.offset'
+ORIG_FILENAME = 'orig.filename'
+

 #=================================================================
 class CDXException(WbException):
@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
    """
    CDX_FORMATS = [
        # Public CDX Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "length"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, LENGTH],

        # CDX 11 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "robotflags", "length", "offset", "filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],

        # CDX 9 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "offset", "filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, OFFSET, FILENAME],

        # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "robotflags", "length", "offset", "filename",
-         "orig.length", "orig.offset", "orig.filename"],
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],

        # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
-         "digest", "redirect", "offset", "filename",
-         "orig.length", "orig.offset", "orig.filename"]
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
+         DIGEST, REDIRECT, OFFSET, FILENAME,
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
    ]

+
+    CDX_ALT_FIELDS = {
+                  'u': ORIGINAL,
+                  'url': ORIGINAL,
+
+                  'status': STATUSCODE,
+                  's': STATUSCODE,
+
+                  'mime': MIMETYPE,
+                  'm': MIMETYPE,
+
+                  'l': LENGTH,
+                  's': LENGTH,
+
+                  'o': OFFSET,
+
+                  'd': DIGEST,
+
+                  't': TIMESTAMP,
+
+                  'k': URLKEY,
+
+                  'f': FILENAME
+    }
+
    def __init__(self, cdxline=''):
        OrderedDict.__init__(self)

@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
            self.cdxline = cdxline
            return

-        fields = cdxline.split(' ')
+        fields = cdxline.split(' ' , 2)
+        # Check for CDX JSON
+        if fields[-1].startswith('{'):
+            self[URLKEY] = fields[0]
+            self[TIMESTAMP] = fields[1]
+            json_fields = json_decode(fields[-1])
+            for n, v in json_fields.iteritems():
+                n = self.CDX_ALT_FIELDS.get(n, n)
+                self[n] = str(v)
+            self.cdxline = cdxline
+            return
+
+        more_fields = fields.pop().split(' ')
+        fields.extend(more_fields)

        cdxformat = None
        for i in self.CDX_FORMATS:
@ -80,8 +138,8 @@ class CDXObject(OrderedDict):

    def is_revisit(self):
        """return ``True`` if this record is a revisit record."""
-        return (self['mimetype'] == 'warc/revisit' or
-                self['filename'] == '-')
+        return (self.get(MIMETYPE) == 'warc/revisit' or
+                self.get(FILENAME) == '-')

    def to_text(self, fields=None):
        """
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
            # apply filter to cdx[field]
            else:
                self.field = parts[0]
+                self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
+                                                          self.field)
                string = parts[1]

            # make regex if regex mode
@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
                self.filter_str = string

        def __call__(self, cdx):
-            val = cdx[self.field] if self.field else str(cdx)
+            if not self.field:
+                val = str(cdx)
+            else:
+                val = cdx.get(self.field, '')

            matched = self.compare_func(val)

@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
        if original_cdx and is_revisit:
            fill_orig = lambda field: original_cdx[field]
            # Transfer mimetype and statuscode
-            cdx['mimetype'] = original_cdx['mimetype']
-            cdx['statuscode'] = original_cdx['statuscode']
+            cdx['mimetype'] = original_cdx.get('mimetype', 'none')
+            cdx['statuscode'] = original_cdx.get('statuscode', 'none')
        else:
            fill_orig = lambda field: '-'

--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
        if filename.startswith('redis://'):
            return RedisCDXSource(filename, config)

-        if filename.endswith('.cdx'):
+        if filename.endswith(('.cdx', '.cdxj')):
            return CDXFile(filename)

        if filename.endswith(('.summary', '.idx')):
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -29,13 +29,10 @@ class CDXFile(CDXSource):

    def load_cdx(self, query):
        def do_open():
-            try:
-                source = open(self.filename, 'rb')
+            with open(self.filename, 'rb') as source:
                gen = iter_range(source, query.key, query.end_key)
                for line in gen:
                    yield line
-            finally:
-                source.close()

        return do_open()
        #return iter_range(do_open(), query.key, query.end_key)
--- a/pywb/cdx/test/test_cdxops.py
+++ b/pywb/cdx/test/test_cdxops.py
@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

+# Filter Alt field name
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
+org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
+
+# Filter -- no field specified, match regex on entire line
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
+org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
+
+# Filter -- no such field, no matches
+>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
+Traceback (most recent call last):
+NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
+
 # Filter exact
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
 org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz

-
 # Sort by closest timestamp + field select output
 >>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
 20140126200826
@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
 >>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
 org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
+
+# Resolve Revisit -- cdxj minimal
+#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
+
+
+
 """

 #=================================================================
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -197,6 +197,9 @@ class ArchiveIterator(object):
 class ArchiveIndexEntryMixin(object):
    MIME_RE = re.compile('[; ]')

+    def reset_entry(self):
+        self['key'] = ''
+
    def extract_mime(self, mime, def_mime='unk'):
        """ Utility function to extract mimetype only
        from a full content type, removing charset settings
@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
            self['status'] = '-'

    def set_rec_info(self, offset, length, digest):
-        self['length'] = str(length)
-        self['offset'] = str(offset)
        if digest:
            self['digest'] = digest

+        self['length'] = str(length)
+        self['offset'] = str(offset)
+
    def merge_request_data(self, other, options):
        surt_ordered = options.get('surt_ordered', True)

@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
 class DefaultRecordIter(object):
    def __init__(self, **options):
        self.options = options
+        self.entry_cache = {}

-    def _create_index_entry(self):
+    def _create_index_entry(self, rec_type):
+        try:
+            entry = self.entry_cache[rec_type]
+            entry.reset_entry()
+        except:
            if self.options.get('cdxj'):
-            return OrderedArchiveIndexEntry()
+                entry = OrderedArchiveIndexEntry()
            else:
-            return ArchiveIndexEntry()
+                entry = ArchiveIndexEntry()
+
+            self.entry_cache[rec_type] = entry
+
+        return entry

    def create_record_iter(self, arcv_iter):
        append_post = self.options.get('append_post')
@ -295,8 +308,7 @@ class DefaultRecordIter(object):

            compute_digest = False

-            if (not minimal and
-                entry.get('digest', '-') == '-' and
+            if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):

                compute_digest = True
@ -312,7 +324,6 @@ class DefaultRecordIter(object):

                entry['_post_query'] = post_query

-            #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record
@ -355,7 +366,7 @@ class DefaultRecordIter(object):
        """ Parse warc record
        """

-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)

        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
@ -369,12 +380,11 @@ class DefaultRecordIter(object):
        entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                   get_header('WARC-Date'))

-        if self.options.get('minimal'):
-            return entry
-
        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
+        elif self.options.get('minimal'):
+            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
@ -382,7 +392,7 @@ class DefaultRecordIter(object):
                               def_mime)

        # status -- only for response records (by convention):
-        if record.rec_type == 'response':
+        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
        else:
            entry['status'] = '-'
@ -414,7 +424,7 @@ class DefaultRecordIter(object):
        # replace nulls
        url = url.replace('\x00', '%00')

-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)
        entry['url'] = url

        # timestamp
@ -422,9 +432,7 @@ class DefaultRecordIter(object):
        if len(entry['timestamp']) > 14:
            entry['timestamp'] = entry['timestamp'][:14]

-        if self.options.get('minimal'):
-            return entry
-
+        if not self.options.get('minimal'):
            # mime
            entry.extract_mime(record.rec_headers.get_header('content-type'))

--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -16,6 +16,7 @@ from io import BytesIO

 from archiveiterator import DefaultRecordIter

+
 #=================================================================
 class BaseCDXWriter(object):
    def __init__(self, out):
@ -68,26 +69,6 @@ class CDXJ(object):
        out.write('\n')


-#=================================================================
-class CDX06(object):
-    def _write_header(self):
-        self.out.write(' CDX N b a S V g\n')
-
-    def write_cdx_line(self, out, entry, filename):
-        out.write(entry['key'])
-        out.write(' ')
-        out.write(entry['timestamp'])
-        out.write(' ')
-        out.write(entry['url'])
-        out.write(' ')
-        out.write(entry['length'])
-        out.write(' ')
-        out.write(entry['offset'])
-        out.write(' ')
-        out.write(filename)
-        out.write('\n')
-
-
 #=================================================================
 class CDX09(object):
    def _write_header(self):
@ -201,6 +182,8 @@ def cdx_filename(filename):
 #=================================================================
 def get_cdx_writer_cls(options):
    writer_cls = options.get('writer_cls')
+    if options.get('minimal'):
+        options['cdxj'] = True

    if writer_cls:
        if not options.get('writer_add_mixin'):
@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):

    if options.get('cdxj'):
        format_mixin = CDXJ
-    elif options.get('cdx06') or options.get('minimal'):
-        format_mixin = CDX06
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
    cdx09_help = """
 Use older 9-field cdx format, default is 11-cdx field
 """
-    minimal_help = """
-Use a minimal 6-field cdx format, outputing only the basic fields
-needed to identiyfy record:
-canonicalized url, timestamp, original url, archive offset, archive length
-and archive filename.
+    minimal_json_help = """
+CDX JSON output, but with minimal fields only, available  w/o parsing
+http record. The fields are:
+canonicalized url, timestamp, original url, digest, archive offset, archive length
+and archive filename. mimetype is included to indicate warc/revisit only.

 This option skips record parsing and will not work with
 POST append (-p) option
+"""
+
+    json_help = """
+Output CDX JSON format per line, with url timestamp first, followed by json dict
+for all other fields:
+url timestamp { ... }
 """

    output_help = """output file or directory.
@ -370,15 +357,13 @@ if input is a directory"""
                        action='store_true',
                        help=cdx09_help)

-    group.add_argument('-6', '--cdx06',
-                        action='store_true')
-
    group.add_argument('-j', '--cdxj',
-                        action='store_true')
-
-    parser.add_argument('-m', '--minimal',
                        action='store_true',
-                        help=minimal_help)
+                        help=json_help)
+
+    parser.add_argument('-mj', '--minimal-cdxj',
+                        action='store_true',
+                        help=minimal_json_help)

    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)
@ -392,9 +377,8 @@ if input is a directory"""
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          cdx09=cmd.cdx09,
-                          cdx06=cmd.cdx06,
                          cdxj=cmd.cdxj,
-                          minimal=cmd.minimal)
+                          minimal=cmd.minimal_cdxj)


 if __name__ == '__main__':
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz

-# warc.gz -- minimal cdx
->>> print_cdx_index('example.warc.gz', minimal=True)
- CDX N b a S V g
-com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
-com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
-org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
-
 # warc.gz -- minimal CDXJ
 >>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
-com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
-com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
-org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
+org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}

 # warc.gz -- parse all
 >>> print_cdx_index('example.warc.gz', include_all=True)
@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
 >>> print_cdx_index('example.arc.gz', cdxj=True)
 com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}

+# arc.gz -- minimal + json
+>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
+com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
+
 # arc
 >>> print_cdx_index('example.arc')
 CDX N b a m s k r M S V g
--- a/setup.py
+++ b/setup.py
@ -60,6 +60,8 @@ setup(
        },
    data_files=[
        ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
+        ('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
+        ('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
        ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
        ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
        ('sample_archive/text_content',
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@ -46,6 +46,8 @@ collections:
        index_paths: ./sample_archive/cdx/
        redir_to_exact: false

+    pywb-cdxj:
+        index_paths: ./sample_archive/cdxj/


 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -124,6 +124,14 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body

+    def test_replay_cdxj(self):
+        resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1')
+        self._assert_basic_html(resp)
+
+        assert '"20140103030321"' in resp.body
+        assert 'wb.js' in resp.body
+        assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body
+
    def test_zero_len_revisit(self):
        resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
        self._assert_basic_html(resp)