cdxj: support loading cdxj (#76)

cdx obj: allow alt field names to be used (e.g. mime/mimetype/m, status/statuscode/s) in querying and reading cdx
cdx minimal: (#75) now implies cdxj to avoid more formats
minimal includes digest always, and mime when warc/revisit
tests for cdxj loading
indexing optimization: reuse same entry obj for records of same type
2025-03-15 00:03:28 +01:00 · 2015-03-19 11:20:40 -07:00 · 2015-03-19 11:20:40 -07:00 · fe1c32c8f7
commit fe1c32c8f7
parent 73f24f5a2b
12 changed files with 174 additions and 95 deletions
--- a/config.yaml
+++ b/config.yaml
@ -107,8 +107,8 @@ enable_http_proxy: true
 # List of route names:
 # <route>: <package or file path>
 # default route static/default for pywb defaults
-static_routes:
+#static_routes:
-          static/default: pywb/static/
+#          static/default: pywb/static/
 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: true
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -10,6 +10,26 @@ from urlparse import parse_qs
 from pywb.utils.wbexception import WbException
 from json import loads as json_decode
 #=================================================================
 URLKEY = 'urlkey'
 TIMESTAMP = 'timestamp'
 ORIGINAL = 'original'
 MIMETYPE = 'mimetype'
 STATUSCODE = 'statuscode'
 DIGEST = 'digest'
 REDIRECT = 'redirect'
 ROBOTFLAGS = 'robotflags'
 LENGTH = 'length'
 OFFSET = 'offset'
 FILENAME = 'filename'
 ORIG_LENGTH = 'orig.length'
 ORIG_OFFSET = 'orig.offset'
 ORIG_FILENAME = 'orig.filename'
 #=================================================================
 class CDXException(WbException):
@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
    """
    CDX_FORMATS = [
        # Public CDX Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
-         "digest", "length"],
+         DIGEST, LENGTH],
        # CDX 11 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
-         "digest", "redirect", "robotflags", "length", "offset", "filename"],
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],
        # CDX 9 Format
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
-         "digest", "redirect", "offset", "filename"],
+         DIGEST, REDIRECT, OFFSET, FILENAME],
        # CDX 11 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
-         "digest", "redirect", "robotflags", "length", "offset", "filename",
+         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
-         "orig.length", "orig.offset", "orig.filename"],
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
        # CDX 9 Format + 3 revisit resolve fields
-        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
+        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
-         "digest", "redirect", "offset", "filename",
+         DIGEST, REDIRECT, OFFSET, FILENAME,
-         "orig.length", "orig.offset", "orig.filename"]
+         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
    ]
    CDX_ALT_FIELDS = {
                  'u': ORIGINAL,
                  'url': ORIGINAL,
                  'status': STATUSCODE,
                  's': STATUSCODE,
                  'mime': MIMETYPE,
                  'm': MIMETYPE,
                  'l': LENGTH,
                  's': LENGTH,
                  'o': OFFSET,
                  'd': DIGEST,
                  't': TIMESTAMP,
                  'k': URLKEY,
                  'f': FILENAME
    }
    def __init__(self, cdxline=''):
        OrderedDict.__init__(self)
@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
            self.cdxline = cdxline
            return
-        fields = cdxline.split(' ')
+        fields = cdxline.split(' ' , 2)
        # Check for CDX JSON
        if fields[-1].startswith('{'):
            self[URLKEY] = fields[0]
            self[TIMESTAMP] = fields[1]
            json_fields = json_decode(fields[-1])
            for n, v in json_fields.iteritems():
                n = self.CDX_ALT_FIELDS.get(n, n)
                self[n] = str(v)
            self.cdxline = cdxline
            return
        more_fields = fields.pop().split(' ')
        fields.extend(more_fields)
        cdxformat = None
        for i in self.CDX_FORMATS:
@ -80,8 +138,8 @@ class CDXObject(OrderedDict):
    def is_revisit(self):
        """return ``True`` if this record is a revisit record."""
-        return (self['mimetype'] == 'warc/revisit' or
+        return (self.get(MIMETYPE) == 'warc/revisit' or
-                self['filename'] == '-')
+                self.get(FILENAME) == '-')
    def to_text(self, fields=None):
        """
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
            # apply filter to cdx[field]
            else:
                self.field = parts[0]
                self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
                                                          self.field)
                string = parts[1]
            # make regex if regex mode
@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
                self.filter_str = string
        def __call__(self, cdx):
-            val = cdx[self.field] if self.field else str(cdx)
+            if not self.field:
                val = str(cdx)
            else:
                val = cdx.get(self.field, '')
            matched = self.compare_func(val)
@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
        if original_cdx and is_revisit:
            fill_orig = lambda field: original_cdx[field]
            # Transfer mimetype and statuscode
-            cdx['mimetype'] = original_cdx['mimetype']
+            cdx['mimetype'] = original_cdx.get('mimetype', 'none')
-            cdx['statuscode'] = original_cdx['statuscode']
+            cdx['statuscode'] = original_cdx.get('statuscode', 'none')
        else:
            fill_orig = lambda field: '-'
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
        if filename.startswith('redis://'):
            return RedisCDXSource(filename, config)
-        if filename.endswith('.cdx'):
+        if filename.endswith(('.cdx', '.cdxj')):
            return CDXFile(filename)
        if filename.endswith(('.summary', '.idx')):
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -29,13 +29,10 @@ class CDXFile(CDXSource):
    def load_cdx(self, query):
        def do_open():
-            try:
+            with open(self.filename, 'rb') as source:
                source = open(self.filename, 'rb')
                gen = iter_range(source, query.key, query.end_key)
                for line in gen:
                    yield line
            finally:
                source.close()
        return do_open()
        #return iter_range(do_open(), query.key, query.end_key)
--- a/pywb/cdx/test/test_cdxops.py
+++ b/pywb/cdx/test/test_cdxops.py
@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
 # Filter Alt field name
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
 # Filter -- no field specified, match regex on entire line
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
 # Filter -- no such field, no matches
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
 Traceback (most recent call last):
 NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
 # Filter exact
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
 org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
 org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
 # Sort by closest timestamp + field select output
 >>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
 20140126200826
@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
 >>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
 org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
 # Resolve Revisit -- cdxj minimal
 #>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
 """
 #=================================================================
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -197,6 +197,9 @@ class ArchiveIterator(object):
 class ArchiveIndexEntryMixin(object):
    MIME_RE = re.compile('[; ]')
    def reset_entry(self):
        self['key'] = ''
    def extract_mime(self, mime, def_mime='unk'):
        """ Utility function to extract mimetype only
        from a full content type, removing charset settings
@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
            self['status'] = '-'
    def set_rec_info(self, offset, length, digest):
        self['length'] = str(length)
        self['offset'] = str(offset)
        if digest:
            self['digest'] = digest
        self['length'] = str(length)
        self['offset'] = str(offset)
    def merge_request_data(self, other, options):
        surt_ordered = options.get('surt_ordered', True)
@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
 class DefaultRecordIter(object):
    def __init__(self, **options):
        self.options = options
        self.entry_cache = {}
-    def _create_index_entry(self):
+    def _create_index_entry(self, rec_type):
-        if self.options.get('cdxj'):
+        try:
-            return OrderedArchiveIndexEntry()
+            entry = self.entry_cache[rec_type]
-        else:
+            entry.reset_entry()
-            return ArchiveIndexEntry()
+        except:
            if self.options.get('cdxj'):
                entry = OrderedArchiveIndexEntry()
            else:
                entry = ArchiveIndexEntry()
            self.entry_cache[rec_type] = entry
        return entry
    def create_record_iter(self, arcv_iter):
        append_post = self.options.get('append_post')
@ -295,8 +308,7 @@ class DefaultRecordIter(object):
            compute_digest = False
-            if (not minimal and
+            if (entry.get('digest', '-') == '-' and
                entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
                compute_digest = True
@ -312,7 +324,6 @@ class DefaultRecordIter(object):
                entry['_post_query'] = post_query
            #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record
@ -355,7 +366,7 @@ class DefaultRecordIter(object):
        """ Parse warc record
        """
-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)
        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
@ -369,12 +380,11 @@ class DefaultRecordIter(object):
        entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                   get_header('WARC-Date'))
        if self.options.get('minimal'):
            return entry
        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
        elif self.options.get('minimal'):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
@ -382,7 +392,7 @@ class DefaultRecordIter(object):
                               def_mime)
        # status -- only for response records (by convention):
-        if record.rec_type == 'response':
+        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
        else:
            entry['status'] = '-'
@ -414,7 +424,7 @@ class DefaultRecordIter(object):
        # replace nulls
        url = url.replace('\x00', '%00')
-        entry = self._create_index_entry()
+        entry = self._create_index_entry(record.rec_type)
        entry['url'] = url
        # timestamp
@ -422,14 +432,12 @@ class DefaultRecordIter(object):
        if len(entry['timestamp']) > 14:
            entry['timestamp'] = entry['timestamp'][:14]
-        if self.options.get('minimal'):
+        if not self.options.get('minimal'):
-            return entry
+            # mime
            entry.extract_mime(record.rec_headers.get_header('content-type'))
-        # mime
+            # status
-        entry.extract_mime(record.rec_headers.get_header('content-type'))
+            entry.extract_status(record.status_headers)
        # status
        entry.extract_status(record.status_headers)
        # digest
        entry['digest'] = '-'
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -16,6 +16,7 @@ from io import BytesIO
 from archiveiterator import DefaultRecordIter
 #=================================================================
 class BaseCDXWriter(object):
    def __init__(self, out):
@ -68,26 +69,6 @@ class CDXJ(object):
        out.write('\n')
 #=================================================================
 class CDX06(object):
    def _write_header(self):
        self.out.write(' CDX N b a S V g\n')
    def write_cdx_line(self, out, entry, filename):
        out.write(entry['key'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        out.write(entry['url'])
        out.write(' ')
        out.write(entry['length'])
        out.write(' ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
 #=================================================================
 class CDX09(object):
    def _write_header(self):
@ -201,6 +182,8 @@ def cdx_filename(filename):
 #=================================================================
 def get_cdx_writer_cls(options):
    writer_cls = options.get('writer_cls')
    if options.get('minimal'):
        options['cdxj'] = True
    if writer_cls:
        if not options.get('writer_add_mixin'):
@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):
    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx06') or options.get('minimal'):
        format_mixin = CDX06
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
    cdx09_help = """
 Use older 9-field cdx format, default is 11-cdx field
 """
-    minimal_help = """
+    minimal_json_help = """
-Use a minimal 6-field cdx format, outputing only the basic fields
+CDX JSON output, but with minimal fields only, available  w/o parsing
-needed to identiyfy record:
+http record. The fields are:
-canonicalized url, timestamp, original url, archive offset, archive length
+canonicalized url, timestamp, original url, digest, archive offset, archive length
-and archive filename.
+and archive filename. mimetype is included to indicate warc/revisit only.
 This option skips record parsing and will not work with
 POST append (-p) option
 """
    json_help = """
 Output CDX JSON format per line, with url timestamp first, followed by json dict
 for all other fields:
 url timestamp { ... }
 """
    output_help = """output file or directory.
@ -370,15 +357,13 @@ if input is a directory"""
                        action='store_true',
                        help=cdx09_help)
    group.add_argument('-6', '--cdx06',
                        action='store_true')
    group.add_argument('-j', '--cdxj',
                        action='store_true')
    parser.add_argument('-m', '--minimal',
                        action='store_true',
-                        help=minimal_help)
+                        help=json_help)
    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)
    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)
@ -392,9 +377,8 @@ if input is a directory"""
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          cdx09=cmd.cdx09,
                          cdx06=cmd.cdx06,
                          cdxj=cmd.cdxj,
-                          minimal=cmd.minimal)
+                          minimal=cmd.minimal_cdxj)
 if __name__ == '__main__':
--- a/pywb/warc/test/test_indexing.py
+++ b/pywb/warc/test/test_indexing.py
@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 # warc.gz -- minimal cdx
 >>> print_cdx_index('example.warc.gz', minimal=True)
 CDX N b a S V g
 com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
 com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
 # warc.gz -- minimal CDXJ
 >>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
-com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
-com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
-org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
+org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
 # warc.gz -- parse all
 >>> print_cdx_index('example.warc.gz', include_all=True)
@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
 >>> print_cdx_index('example.arc.gz', cdxj=True)
 com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 # arc.gz -- minimal + json
 >>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
 com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
 # arc
 >>> print_cdx_index('example.arc')
 CDX N b a m s k r M S V g
--- a/setup.py
+++ b/setup.py
@ -60,6 +60,8 @@ setup(
        },
    data_files=[
        ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
        ('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
        ('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
        ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
        ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
        ('sample_archive/text_content',
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@ -46,6 +46,8 @@ collections:
        index_paths: ./sample_archive/cdx/
        redir_to_exact: false
    pywb-cdxj:
        index_paths: ./sample_archive/cdxj/
 # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -124,6 +124,14 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body
    def test_replay_cdxj(self):
        resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1')
        self._assert_basic_html(resp)
        assert '"20140103030321"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body
    def test_zero_len_revisit(self):
        resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
        self._assert_basic_html(resp)