diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 4bc17687..49e67a6f 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -40,7 +40,7 @@ class ArchiveIterator(object): warc2warc -Z myfile.{0} > myfile.{0}.gz """ - def __init__(self, fileobj): + def __init__(self, fileobj, no_record_parse=False): self.fh = fileobj self.loader = ArcWarcRecordLoader() @@ -50,6 +50,7 @@ class ArchiveIterator(object): self.known_format = None self.member_info = None + self.no_record_parse = no_record_parse def iter_records(self, block_size=16384): """ iterate over each record @@ -176,7 +177,8 @@ class ArchiveIterator(object): """ record = self.loader.parse_record_stream(self.reader, next_line, - self.known_format) + self.known_format, + self.no_record_parse) self.member_info = None @@ -254,6 +256,10 @@ class DefaultRecordIter(object): minimal = self.options.get('minimal') append_post = self.options.get('append_post') + if append_post and minimal: + raise Exception('Sorry, minimal index option and ' + + 'append POST options can not be used together') + for record in arcv_iter.iter_records(block_size): entry = None @@ -423,7 +429,7 @@ class DefaultRecordIter(object): return entry def __call__(self, fh): - aiter = ArchiveIterator(fh) + aiter = ArchiveIterator(fh, self.options.get('minimal', False)) entry_iter = self.create_record_iter(aiter) diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 3bcb1c1a..84cfe585 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -173,7 +173,7 @@ def get_cdx_writer_cls(options): if options.get('cdx09'): format_mixin = CDX09 - elif options.get('cdx06'): + elif options.get('minimal'): format_mixin = CDX06 else: format_mixin = CDX11 @@ -269,6 +269,15 @@ Not-recommended for new cdx, use only for backwards-compatibility. cdx09_help = """ Use older 9-field cdx format, default is 11-cdx field +""" + minimal_help = """ +Use a minimal 6-field cdx format, outputing only the basic fields +needed to identiyfy record: +canonicalized url, timestamp, original url, archive offset, archive length +and archive filename. + +This option skips record parsing and will not work with +POST append (-p) option """ output_help = """output file or directory. @@ -320,9 +329,9 @@ if input is a directory""" action='store_true', help=cdx09_help) - group.add_argument('-6', '--cdx06', + group.add_argument('-m', '--minimal', action='store_true', - help=cdx09_help) + help=minimal_help) parser.add_argument('output', nargs='?', default='-', help=output_help) parser.add_argument('inputs', nargs='+', help=input_help) @@ -336,8 +345,7 @@ if input is a directory""" append_post=cmd.postappend, recurse=cmd.recurse, cdx09=cmd.cdx09, - cdx06=cmd.cdx06, - minimal=cmd.cdx06) + minimal=cmd.minimal) if __name__ == '__main__': diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 67cc9e22..c2bc4b74 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -83,7 +83,9 @@ class ArcWarcRecordLoader: return self.parse_record_stream(stream) def parse_record_stream(self, stream, - statusline=None, known_format=None): + statusline=None, + known_format=None, + no_record_parse=False): """ Parse file-like stream and return an ArcWarcRecord encapsulating the record headers, http headers (if any), and a stream limited to the remainder of the record. @@ -128,8 +130,12 @@ class ArcWarcRecordLoader: # limit stream to the length for all valid records stream = LimitReader.wrap_stream(stream, length) + # don't parse the http record at all + if no_record_parse: + status_headers = StatusAndHeaders('', []) + # if empty record (error or otherwise) set status to 204 - if length == 0: + elif length == 0: if is_err: msg = '204 Possible Error' else: