mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexing: for a minimal index, use a single -m flag to create a 6-field index.
The minimal index also skips parsing the contents of warc/arc records altogether. Add cli docs for the minimal index; tracked via #75.
This commit is contained in:
parent
499e21233e
commit
fe1683da56
@ -40,7 +40,7 @@ class ArchiveIterator(object):
|
|||||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, fileobj):
|
def __init__(self, fileobj, no_record_parse=False):
|
||||||
self.fh = fileobj
|
self.fh = fileobj
|
||||||
|
|
||||||
self.loader = ArcWarcRecordLoader()
|
self.loader = ArcWarcRecordLoader()
|
||||||
@ -50,6 +50,7 @@ class ArchiveIterator(object):
|
|||||||
self.known_format = None
|
self.known_format = None
|
||||||
|
|
||||||
self.member_info = None
|
self.member_info = None
|
||||||
|
self.no_record_parse = no_record_parse
|
||||||
|
|
||||||
def iter_records(self, block_size=16384):
|
def iter_records(self, block_size=16384):
|
||||||
""" iterate over each record
|
""" iterate over each record
|
||||||
@ -176,7 +177,8 @@ class ArchiveIterator(object):
|
|||||||
"""
|
"""
|
||||||
record = self.loader.parse_record_stream(self.reader,
|
record = self.loader.parse_record_stream(self.reader,
|
||||||
next_line,
|
next_line,
|
||||||
self.known_format)
|
self.known_format,
|
||||||
|
self.no_record_parse)
|
||||||
|
|
||||||
self.member_info = None
|
self.member_info = None
|
||||||
|
|
||||||
@ -254,6 +256,10 @@ class DefaultRecordIter(object):
|
|||||||
minimal = self.options.get('minimal')
|
minimal = self.options.get('minimal')
|
||||||
append_post = self.options.get('append_post')
|
append_post = self.options.get('append_post')
|
||||||
|
|
||||||
|
if append_post and minimal:
|
||||||
|
raise Exception('Sorry, minimal index option and ' +
|
||||||
|
'append POST options can not be used together')
|
||||||
|
|
||||||
for record in arcv_iter.iter_records(block_size):
|
for record in arcv_iter.iter_records(block_size):
|
||||||
entry = None
|
entry = None
|
||||||
|
|
||||||
@ -423,7 +429,7 @@ class DefaultRecordIter(object):
|
|||||||
return entry
|
return entry
|
||||||
|
|
||||||
def __call__(self, fh):
|
def __call__(self, fh):
|
||||||
aiter = ArchiveIterator(fh)
|
aiter = ArchiveIterator(fh, self.options.get('minimal', False))
|
||||||
|
|
||||||
entry_iter = self.create_record_iter(aiter)
|
entry_iter = self.create_record_iter(aiter)
|
||||||
|
|
||||||
|
@ -173,7 +173,7 @@ def get_cdx_writer_cls(options):
|
|||||||
|
|
||||||
if options.get('cdx09'):
|
if options.get('cdx09'):
|
||||||
format_mixin = CDX09
|
format_mixin = CDX09
|
||||||
elif options.get('cdx06'):
|
elif options.get('minimal'):
|
||||||
format_mixin = CDX06
|
format_mixin = CDX06
|
||||||
else:
|
else:
|
||||||
format_mixin = CDX11
|
format_mixin = CDX11
|
||||||
@ -269,6 +269,15 @@ Not-recommended for new cdx, use only for backwards-compatibility.
|
|||||||
|
|
||||||
cdx09_help = """
|
cdx09_help = """
|
||||||
Use older 9-field cdx format, default is 11-cdx field
|
Use older 9-field cdx format, default is 11-cdx field
|
||||||
|
"""
|
||||||
|
minimal_help = """
|
||||||
|
Use a minimal 6-field cdx format, outputing only the basic fields
|
||||||
|
needed to identiyfy record:
|
||||||
|
canonicalized url, timestamp, original url, archive offset, archive length
|
||||||
|
and archive filename.
|
||||||
|
|
||||||
|
This option skips record parsing and will not work with
|
||||||
|
POST append (-p) option
|
||||||
"""
|
"""
|
||||||
|
|
||||||
output_help = """output file or directory.
|
output_help = """output file or directory.
|
||||||
@ -320,9 +329,9 @@ if input is a directory"""
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help=cdx09_help)
|
help=cdx09_help)
|
||||||
|
|
||||||
group.add_argument('-6', '--cdx06',
|
group.add_argument('-m', '--minimal',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help=cdx09_help)
|
help=minimal_help)
|
||||||
|
|
||||||
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
parser.add_argument('output', nargs='?', default='-', help=output_help)
|
||||||
parser.add_argument('inputs', nargs='+', help=input_help)
|
parser.add_argument('inputs', nargs='+', help=input_help)
|
||||||
@ -336,8 +345,7 @@ if input is a directory"""
|
|||||||
append_post=cmd.postappend,
|
append_post=cmd.postappend,
|
||||||
recurse=cmd.recurse,
|
recurse=cmd.recurse,
|
||||||
cdx09=cmd.cdx09,
|
cdx09=cmd.cdx09,
|
||||||
cdx06=cmd.cdx06,
|
minimal=cmd.minimal)
|
||||||
minimal=cmd.cdx06)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -83,7 +83,9 @@ class ArcWarcRecordLoader:
|
|||||||
return self.parse_record_stream(stream)
|
return self.parse_record_stream(stream)
|
||||||
|
|
||||||
def parse_record_stream(self, stream,
|
def parse_record_stream(self, stream,
|
||||||
statusline=None, known_format=None):
|
statusline=None,
|
||||||
|
known_format=None,
|
||||||
|
no_record_parse=False):
|
||||||
""" Parse file-like stream and return an ArcWarcRecord
|
""" Parse file-like stream and return an ArcWarcRecord
|
||||||
encapsulating the record headers, http headers (if any),
|
encapsulating the record headers, http headers (if any),
|
||||||
and a stream limited to the remainder of the record.
|
and a stream limited to the remainder of the record.
|
||||||
@ -128,8 +130,12 @@ class ArcWarcRecordLoader:
|
|||||||
# limit stream to the length for all valid records
|
# limit stream to the length for all valid records
|
||||||
stream = LimitReader.wrap_stream(stream, length)
|
stream = LimitReader.wrap_stream(stream, length)
|
||||||
|
|
||||||
|
# don't parse the http record at all
|
||||||
|
if no_record_parse:
|
||||||
|
status_headers = StatusAndHeaders('', [])
|
||||||
|
|
||||||
# if empty record (error or otherwise) set status to 204
|
# if empty record (error or otherwise) set status to 204
|
||||||
if length == 0:
|
elif length == 0:
|
||||||
if is_err:
|
if is_err:
|
||||||
msg = '204 Possible Error'
|
msg = '204 Possible Error'
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user