mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: for minimal index, use a single -m flag to create a 6 field index.
minimal index also skips parsing contents of warc/arc records altogether. add cli docs for minimal index, tracked via #75
This commit is contained in:
parent
499e21233e
commit
fe1683da56
@ -40,7 +40,7 @@ class ArchiveIterator(object):
|
||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||
"""
|
||||
|
||||
def __init__(self, fileobj):
|
||||
def __init__(self, fileobj, no_record_parse=False):
|
||||
self.fh = fileobj
|
||||
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
@ -50,6 +50,7 @@ class ArchiveIterator(object):
|
||||
self.known_format = None
|
||||
|
||||
self.member_info = None
|
||||
self.no_record_parse = no_record_parse
|
||||
|
||||
def iter_records(self, block_size=16384):
|
||||
""" iterate over each record
|
||||
@ -176,7 +177,8 @@ class ArchiveIterator(object):
|
||||
"""
|
||||
record = self.loader.parse_record_stream(self.reader,
|
||||
next_line,
|
||||
self.known_format)
|
||||
self.known_format,
|
||||
self.no_record_parse)
|
||||
|
||||
self.member_info = None
|
||||
|
||||
@ -254,6 +256,10 @@ class DefaultRecordIter(object):
|
||||
minimal = self.options.get('minimal')
|
||||
append_post = self.options.get('append_post')
|
||||
|
||||
if append_post and minimal:
|
||||
raise Exception('Sorry, minimal index option and ' +
|
||||
'append POST options can not be used together')
|
||||
|
||||
for record in arcv_iter.iter_records(block_size):
|
||||
entry = None
|
||||
|
||||
@ -423,7 +429,7 @@ class DefaultRecordIter(object):
|
||||
return entry
|
||||
|
||||
def __call__(self, fh):
|
||||
aiter = ArchiveIterator(fh)
|
||||
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
|
||||
|
||||
entry_iter = self.create_record_iter(aiter)
|
||||
|
||||
|
@ -173,7 +173,7 @@ def get_cdx_writer_cls(options):
|
||||
|
||||
if options.get('cdx09'):
|
||||
format_mixin = CDX09
|
||||
elif options.get('cdx06'):
|
||||
elif options.get('minimal'):
|
||||
format_mixin = CDX06
|
||||
else:
|
||||
format_mixin = CDX11
|
||||
@ -269,6 +269,15 @@ Not-recommended for new cdx, use only for backwards-compatibility.
|
||||
|
||||
cdx09_help = """
|
||||
Use older 9-field cdx format, default is 11-cdx field
|
||||
"""
|
||||
minimal_help = """
|
||||
Use a minimal 6-field cdx format, outputing only the basic fields
|
||||
needed to identiyfy record:
|
||||
canonicalized url, timestamp, original url, archive offset, archive length
|
||||
and archive filename.
|
||||
|
||||
This option skips record parsing and will not work with
|
||||
POST append (-p) option
|
||||
"""
|
||||
|
||||
output_help = """output file or directory.
|
||||
@ -320,9 +329,9 @@ if input is a directory"""
|
||||
action='store_true',
|
||||
help=cdx09_help)
|
||||
|
||||
group.add_argument('-6', '--cdx06',
|
||||
group.add_argument('-m', '--minimal',
|
||||
action='store_true',
|
||||
help=cdx09_help)
|
||||
help=minimal_help)
|
||||
|
||||
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||
@ -336,8 +345,7 @@ if input is a directory"""
|
||||
append_post=cmd.postappend,
|
||||
recurse=cmd.recurse,
|
||||
cdx09=cmd.cdx09,
|
||||
cdx06=cmd.cdx06,
|
||||
minimal=cmd.cdx06)
|
||||
minimal=cmd.minimal)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -83,7 +83,9 @@ class ArcWarcRecordLoader:
|
||||
return self.parse_record_stream(stream)
|
||||
|
||||
def parse_record_stream(self, stream,
|
||||
statusline=None, known_format=None):
|
||||
statusline=None,
|
||||
known_format=None,
|
||||
no_record_parse=False):
|
||||
""" Parse file-like stream and return an ArcWarcRecord
|
||||
encapsulating the record headers, http headers (if any),
|
||||
and a stream limited to the remainder of the record.
|
||||
@ -128,8 +130,12 @@ class ArcWarcRecordLoader:
|
||||
# limit stream to the length for all valid records
|
||||
stream = LimitReader.wrap_stream(stream, length)
|
||||
|
||||
# don't parse the http record at all
|
||||
if no_record_parse:
|
||||
status_headers = StatusAndHeaders('', [])
|
||||
|
||||
# if empty record (error or otherwise) set status to 204
|
||||
if length == 0:
|
||||
elif length == 0:
|
||||
if is_err:
|
||||
msg = '204 Possible Error'
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user