cdx indexing: wrap record iterator global functions in class DefaultRecordIter to allow for better extensibility

add 'minimal' option to skip digest/mime/status extraction and include only minimal data (url+timestamp); cdx-indexer: add -6 option to create 6-field index
2025-03-28 00:25:21 +01:00 · 2015-02-25 13:31:37 -08:00 · 2015-02-25 13:31:37 -08:00 · 671f45f69f
commit 671f45f69f
parent 1d4c54deaa
2 changed files with 194 additions and 167 deletions
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -242,10 +242,14 @@ class ArchiveIndexEntry(object):
 #=================================================================
-def create_record_iter(arcv_iter, options):
+class DefaultRecordIter(object):
-    append_post = options.get('append_post')
+    def __init__(self, **options):
-    include_all = options.get('include_all')
+        self.options = options
-    block_size = options.get('block_size', 16384)
+
    def create_record_iter(self, arcv_iter):
        append_post = self.options.get('append_post')
        include_all = self.options.get('include_all')
        block_size = self.options.get('block_size', 16384)
        for record in arcv_iter.iter_records(block_size):
            entry = None
@ -263,16 +267,16 @@ def create_record_iter(arcv_iter, options):
                      record.content_type == 'application/warc-fields'):
                    continue
-            entry = parse_warc_record(record)
+                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
-            entry = parse_arc_record(record)
+                entry = self.parse_arc_record(record)
            if not entry:
                continue
            if entry.url and not entry.key:
                entry.key = canonicalize(entry.url,
-                                     options.get('surt_ordered', True))
+                                         self.options.get('surt_ordered', True))
            compute_digest = False
@ -281,7 +285,7 @@ def create_record_iter(arcv_iter, options):
                compute_digest = True
-        elif record.rec_type == 'request' and options.get('append_post'):
+            elif record.rec_type == 'request' and self.options.get('append_post'):
                method = record.status_headers.protocol
                len_ = record.status_headers.get_header('Content-Length')
@ -299,9 +303,7 @@ def create_record_iter(arcv_iter, options):
            yield entry
-
+    def join_request_records(self, entry_iter):
 #=================================================================
 def join_request_records(entry_iter, options):
        prev_entry = None
        for entry in entry_iter:
@ -318,8 +320,8 @@ def join_request_records(entry_iter, options):
                  prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
                pass
-        elif (entry.merge_request_data(prev_entry, options) or
+            elif (entry.merge_request_data(prev_entry, self.options) or
-              prev_entry.merge_request_data(entry, options)):
+                  prev_entry.merge_request_data(entry, self.options)):
                yield prev_entry
                yield entry
                prev_entry = None
@ -332,8 +334,8 @@ def join_request_records(entry_iter, options):
            yield prev_entry
-#=================================================================
+    #=================================================================
-def parse_warc_record(record):
+    def parse_warc_record(self, record):
        """ Parse warc record
        """
@ -351,6 +353,9 @@ def parse_warc_record(record):
        entry.timestamp = iso_date_to_timestamp(record.rec_headers.
                                                get_header('WARC-Date'))
        if self.options.get('minimal'):
            return entry
        # mime
        if record.rec_type == 'revisit':
            entry.mime = 'warc/revisit'
@ -377,8 +382,8 @@ def parse_warc_record(record):
        return entry
-#=================================================================
+    #=================================================================
-def parse_arc_record(record):
+    def parse_arc_record(self, record):
        """ Parse arc record
        """
        if record.rec_type == 'arc_header':
@ -400,6 +405,9 @@ def parse_arc_record(record):
        if len(entry.timestamp) > 14:
            entry.timestamp = entry.timestamp[:14]
        if self.options.get('minimal'):
            return entry
        # status
        entry.extract_status(record.status_headers)
@ -411,19 +419,17 @@ def parse_arc_record(record):
        return entry
-
+    def __call__(self, fh):
 #=================================================================
 def create_index_iter(fh, **options):
        aiter = ArchiveIterator(fh)
-    entry_iter = create_record_iter(aiter, options)
+        entry_iter = self.create_record_iter(aiter)
-    if options.get('append_post'):
+        if self.options.get('append_post'):
-        entry_iter = join_request_records(entry_iter, options)
+            entry_iter = self.join_request_records(entry_iter)
        for entry in entry_iter:
            if (entry.record.rec_type in ('request', 'warcinfo') and
-             not options.get('include_all')):
+                 not self.options.get('include_all')):
                continue
            yield entry
--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -5,20 +5,21 @@ from bisect import insort
 from io import BytesIO
-from archiveiterator import create_index_iter
+from archiveiterator import DefaultRecordIter
 #=================================================================
 class CDXWriter(object):
-    def __init__(self, out, cdx09=False):
+    def __init__(self, out, format_):
        self.out = out
-        self.cdx09 = cdx09
+        self.format_ = format_
    def __enter__(self):
-        if not self.cdx09:
+        if self.format_ == 'cdx09':
            self.out.write(' CDX N b a m s k r M S V g\n')
        else:
            self.out.write(' CDX N b a m s k r V g\n')
        elif self.format_ == 'cdx06':
            self.out.write(' CDX N b a S V g\n')
        else:
            self.out.write(' CDX N b a m s k r M S V g\n')
        return self
@ -41,17 +42,24 @@ class CDXWriter(object):
        out.write(' ')
        out.write(entry.url)
        out.write(' ')
        if self.format_ != 'cdx06':
            out.write(entry.mime)
            out.write(' ')
            out.write(entry.status)
            out.write(' ')
            out.write(entry.digest)
-        if self.cdx09:
+
        if self.format_ == 'cdx09':
            out.write(' - ')
        elif self.format_ == 'cdx06':
            out.write(entry.length)
            out.write(' ')
        else:
            out.write(' - - ')
            out.write(entry.length)
            out.write(' ')
        out.write(entry.offset)
        out.write(' ')
        out.write(filename)
@ -153,11 +161,12 @@ def write_multi_cdx_index(output, inputs, **options):
            outfile = open(output, 'wb')
        writer_cls = get_cdx_writer_cls(options)
        record_iter = DefaultRecordIter(**options)
-        with writer_cls(outfile, options.get('cdx09')) as writer:
+        with writer_cls(outfile, options.get('format')) as writer:
            for fullpath, filename in iter_file_or_dir(inputs, recurse):
                with open(fullpath, 'rb') as infile:
-                    entry_iter = create_index_iter(infile, **options)
+                    entry_iter = record_iter(infile)
                    for entry in entry_iter:
                        writer.write(entry, filename)
@ -172,8 +181,8 @@ def write_cdx_index(outfile, infile, filename, **options):
    writer_cls = get_cdx_writer_cls(options)
-    with writer_cls(outfile, options.get('cdx09')) as writer:
+    with writer_cls(outfile, options.get('format')) as writer:
-        entry_iter = create_index_iter(infile, **options)
+        entry_iter = DefaultRecordIter(**options)(infile)
        for entry in entry_iter:
            writer.write(entry, filename)
@ -260,7 +269,12 @@ if input is a directory"""
                        action='store_true',
                        help=unsurt_help)
-    parser.add_argument('-9', '--cdx09',
+    group = parser.add_mutually_exclusive_group()
    group.add_argument('-9', '--cdx09',
                        action='store_true',
                        help=cdx09_help)
    group.add_argument('-6', '--cdx06',
                        action='store_true',
                        help=cdx09_help)
@ -269,13 +283,20 @@ if input is a directory"""
    cmd = parser.parse_args(args=args)
    format_ = 'cdx11'
    if cmd.cdx09:
        format_ = 'cdx09'
    elif cmd.cdx06:
        format_ = 'cdx06'
    write_multi_cdx_index(cmd.output, cmd.inputs,
                          sort=cmd.sort,
                          surt_ordered=not cmd.unsurt,
                          include_all=cmd.allrecords,
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
-                          cdx09=cmd.cdx09)
+                          format=format_,
                          minimal=cmd.cdx06)
 if __name__ == '__main__':