indexing: for a minimal index, use a single -m flag to create a 6-field index.

The minimal index also skips parsing the contents of WARC/ARC records altogether. Add CLI docs for the minimal index; tracked via #75.
2025-03-15 00:03:28 +01:00 · 2015-03-07 11:56:17 -08:00 · 2015-03-07 11:56:17 -08:00 · fe1683da56
commit fe1683da56
parent 499e21233e
3 changed files with 30 additions and 10 deletions
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -40,7 +40,7 @@ class ArchiveIterator(object):
    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

-    def __init__(self, fileobj):
+    def __init__(self, fileobj, no_record_parse=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader()
@ -50,6 +50,7 @@ class ArchiveIterator(object):
        self.known_format = None

        self.member_info = None
+        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
@ -176,7 +177,8 @@ class ArchiveIterator(object):
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
-                                                 self.known_format)
+                                                 self.known_format,
+                                                 self.no_record_parse)

        self.member_info = None

@ -254,6 +256,10 @@ class DefaultRecordIter(object):
        minimal = self.options.get('minimal')
        append_post = self.options.get('append_post')

+        if append_post and minimal:
+            raise Exception('Sorry, minimal index option and ' +
+                            'append POST options can not be used together')
+
        for record in arcv_iter.iter_records(block_size):
            entry = None

@ -423,7 +429,7 @@ class DefaultRecordIter(object):
        return entry

    def __call__(self, fh):
-        aiter = ArchiveIterator(fh)
+        aiter = ArchiveIterator(fh, self.options.get('minimal', False))

        entry_iter = self.create_record_iter(aiter)

--- a/pywb/warc/cdxindexer.py
+++ b/pywb/warc/cdxindexer.py
@ -173,7 +173,7 @@ def get_cdx_writer_cls(options):

    if options.get('cdx09'):
        format_mixin = CDX09
-    elif options.get('cdx06'):
+    elif options.get('minimal'):
        format_mixin = CDX06
    else:
        format_mixin = CDX11
@ -269,6 +269,15 @@ Not-recommended for new cdx, use only for backwards-compatibility.

    cdx09_help = """
 Use older 9-field cdx format, default is 11-cdx field
+"""
+    minimal_help = """
+Use a minimal 6-field cdx format, outputing only the basic fields
+needed to identiyfy record:
+canonicalized url, timestamp, original url, archive offset, archive length
+and archive filename.
+
+This option skips record parsing and will not work with
+POST append (-p) option
 """

    output_help = """output file or directory.
@ -320,9 +329,9 @@ if input is a directory"""
                        action='store_true',
                        help=cdx09_help)

-    group.add_argument('-6', '--cdx06',
+    group.add_argument('-m', '--minimal',
                        action='store_true',
-                        help=cdx09_help)
+                        help=minimal_help)

    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)
@ -336,8 +345,7 @@ if input is a directory"""
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          cdx09=cmd.cdx09,
-                          cdx06=cmd.cdx06,
-                          minimal=cmd.cdx06)
+                          minimal=cmd.minimal)


 if __name__ == '__main__':
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@ -83,7 +83,9 @@ class ArcWarcRecordLoader:
        return self.parse_record_stream(stream)

    def parse_record_stream(self, stream,
-                            statusline=None, known_format=None):
+                            statusline=None,
+                            known_format=None,
+                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.
@ -128,8 +130,12 @@ class ArcWarcRecordLoader:
        # limit stream to the length for all valid records
        stream = LimitReader.wrap_stream(stream, length)

+        # don't parse the http record at all
+        if no_record_parse:
+            status_headers = StatusAndHeaders('', [])
+
        # if empty record (error or otherwise) set status to 204
-        if length == 0:
+        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else: