1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

indexing: for minimal index, use a single -m flag to create a 6 field index.

minimal index also skips parsing contents of warc/arc records altogether
add cli docs for minimal index, tracked via #75
This commit is contained in:
Ilya Kreymer 2015-03-07 11:56:17 -08:00
parent 499e21233e
commit fe1683da56
3 changed files with 30 additions and 10 deletions

View File

@ -40,7 +40,7 @@ class ArchiveIterator(object):
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""
def __init__(self, fileobj):
def __init__(self, fileobj, no_record_parse=False):
self.fh = fileobj
self.loader = ArcWarcRecordLoader()
@ -50,6 +50,7 @@ class ArchiveIterator(object):
self.known_format = None
self.member_info = None
self.no_record_parse = no_record_parse
def iter_records(self, block_size=16384):
""" iterate over each record
@ -176,7 +177,8 @@ class ArchiveIterator(object):
"""
record = self.loader.parse_record_stream(self.reader,
next_line,
self.known_format)
self.known_format,
self.no_record_parse)
self.member_info = None
@ -254,6 +256,10 @@ class DefaultRecordIter(object):
minimal = self.options.get('minimal')
append_post = self.options.get('append_post')
if append_post and minimal:
raise Exception('Sorry, minimal index option and ' +
'append POST options can not be used together')
for record in arcv_iter.iter_records(block_size):
entry = None
@ -423,7 +429,7 @@ class DefaultRecordIter(object):
return entry
def __call__(self, fh):
aiter = ArchiveIterator(fh)
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
entry_iter = self.create_record_iter(aiter)

View File

@ -173,7 +173,7 @@ def get_cdx_writer_cls(options):
if options.get('cdx09'):
format_mixin = CDX09
elif options.get('cdx06'):
elif options.get('minimal'):
format_mixin = CDX06
else:
format_mixin = CDX11
@ -269,6 +269,15 @@ Not-recommended for new cdx, use only for backwards-compatibility.
cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field
"""
# Help text shown for the -m/--minimal command-line flag.
minimal_help = """
Use a minimal 6-field cdx format, outputting only the basic fields
needed to identify a record:
canonicalized url, timestamp, original url, archive offset, archive length
and archive filename.
This option skips record parsing and will not work with
POST append (-p) option
"""
output_help = """output file or directory.
@ -320,9 +329,9 @@ if input is a directory"""
action='store_true',
help=cdx09_help)
group.add_argument('-6', '--cdx06',
group.add_argument('-m', '--minimal',
action='store_true',
help=cdx09_help)
help=minimal_help)
parser.add_argument('output', nargs='?', default='-', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help)
@ -336,8 +345,7 @@ if input is a directory"""
append_post=cmd.postappend,
recurse=cmd.recurse,
cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
minimal=cmd.cdx06)
minimal=cmd.minimal)
if __name__ == '__main__':

View File

@ -83,7 +83,9 @@ class ArcWarcRecordLoader:
return self.parse_record_stream(stream)
def parse_record_stream(self, stream,
statusline=None, known_format=None):
statusline=None,
known_format=None,
no_record_parse=False):
""" Parse file-like stream and return an ArcWarcRecord
encapsulating the record headers, http headers (if any),
and a stream limited to the remainder of the record.
@ -128,8 +130,12 @@ class ArcWarcRecordLoader:
# limit stream to the length for all valid records
stream = LimitReader.wrap_stream(stream, length)
# don't parse the http record at all
if no_record_parse:
status_headers = StatusAndHeaders('', [])
# if empty record (error or otherwise) set status to 204
if length == 0:
elif length == 0:
if is_err:
msg = '204 Possible Error'
else: