mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx indexing: wrap record iterator global functions in class DefaultRecordIter to allow for better extensibility

add 'minimal' option to skip digest/mime/status extraction and include only minimal data (url + timestamp)
cdx-indexer: add -6 option to create 6-field index
This commit is contained in:
Ilya Kreymer 2015-02-25 13:31:37 -08:00
parent 1d4c54deaa
commit 671f45f69f
2 changed files with 194 additions and 167 deletions
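As a quick illustration of the new 6-field mode, a hypothetical invocation and the header plus one sample line of the resulting index (file names and values are illustrative, not from this commit):

    cdx-indexer -6 sample.cdx sample.warc.gz

     CDX N b a S V g
    com,example)/ 20150225000000 http://example.com/ 1043 0 sample.warc.gz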

pywb/warc/archiveiterator.py

@@ -242,188 +242,194 @@ class ArchiveIndexEntry(object):
#=================================================================
class DefaultRecordIter(object):
    def __init__(self, **options):
        self.options = options

    def create_record_iter(self, arcv_iter):
        append_post = self.options.get('append_post')
        include_all = self.options.get('include_all')
        block_size = self.options.get('block_size', 16384)

        for record in arcv_iter.iter_records(block_size):
            entry = None

            if not include_all and (record.status_headers.get_statuscode() == '-'):
                continue

            if record.format == 'warc':
                if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                    continue

                elif (not include_all and
                      record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)

            if not entry:
                continue

            if entry.url and not entry.key:
                entry.key = canonicalize(entry.url,
                                         self.options.get('surt_ordered', True))

            compute_digest = False

            if (entry.digest == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):

                compute_digest = True

            elif record.rec_type == 'request' and self.options.get('append_post'):
                method = record.status_headers.protocol
                len_ = record.status_headers.get_header('Content-Length')

                post_query = extract_post_query(method,
                                                entry.mime,
                                                len_,
                                                record.stream)

                entry.post_query = post_query

            #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record

            yield entry

    def join_request_records(self, entry_iter):
        prev_entry = None

        for entry in entry_iter:
            if not prev_entry:
                prev_entry = entry
                continue

            # check for url match
            if (entry.url != prev_entry.url):
                pass

            # check for concurrency also
            elif (entry.record.rec_headers.get_header('WARC-Concurrent-To') !=
                  prev_entry.record.rec_headers.get_header('WARC-Record-ID')):
                pass

            elif (entry.merge_request_data(prev_entry, self.options) or
                  prev_entry.merge_request_data(entry, self.options)):
                yield prev_entry
                yield entry
                prev_entry = None
                continue

            yield prev_entry
            prev_entry = entry

        if prev_entry:
            yield prev_entry

    #=================================================================
    def parse_warc_record(self, record):
        """ Parse warc record
        """

        entry = ArchiveIndexEntry()

        if record.rec_type == 'warcinfo':
            entry.url = record.rec_headers.get_header('WARC-Filename')
            entry.key = entry.url
            entry.warcinfo = record.stream.read(record.length)
            return entry

        entry.url = record.rec_headers.get_header('WARC-Target-Uri')

        # timestamp
        entry.timestamp = iso_date_to_timestamp(record.rec_headers.
                                                get_header('WARC-Date'))

        if self.options.get('minimal'):
            return entry

        # mime
        if record.rec_type == 'revisit':
            entry.mime = 'warc/revisit'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
                               get_header('Content-Type'),
                               def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response':
            entry.extract_status(record.status_headers)
        else:
            entry.status = '-'

        # digest
        entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
        if entry.digest and entry.digest.startswith('sha1:'):
            entry.digest = entry.digest[len('sha1:'):]

        if not entry.digest:
            entry.digest = '-'

        return entry

    #=================================================================
    def parse_arc_record(self, record):
        """ Parse arc record
        """
        if record.rec_type == 'arc_header':
            return None

        url = record.rec_headers.get_header('uri')
        url = url.replace('\r', '%0D')
        url = url.replace('\n', '%0A')
        # replace formfeed
        url = url.replace('\x0c', '%0C')
        # replace nulls
        url = url.replace('\x00', '%00')

        entry = ArchiveIndexEntry()
        entry.url = url

        # timestamp
        entry.timestamp = record.rec_headers.get_header('archive-date')
        if len(entry.timestamp) > 14:
            entry.timestamp = entry.timestamp[:14]

        if self.options.get('minimal'):
            return entry

        # status
        entry.extract_status(record.status_headers)

        # mime
        entry.extract_mime(record.rec_headers.get_header('content-type'))

        # digest
        entry.digest = '-'

        return entry

    def __call__(self, fh):
        aiter = ArchiveIterator(fh)

        entry_iter = self.create_record_iter(aiter)

        if self.options.get('append_post'):
            entry_iter = self.join_request_records(entry_iter)

        for entry in entry_iter:
            if (entry.record.rec_type in ('request', 'warcinfo') and
                not self.options.get('include_all')):
                continue

            yield entry
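Because the pipeline stages are now methods on DefaultRecordIter, downstream code can override a single stage. A minimal sketch of the kind of subclass this enables; the class name, the filtering rule, and the import path are illustrative assumptions, not part of this commit:

    # Hypothetical subclass: drop records that have no target URL.
    from pywb.warc.archiveiterator import DefaultRecordIter

    class UrlOnlyRecordIter(DefaultRecordIter):
        def parse_warc_record(self, record):
            entry = super(UrlOnlyRecordIter, self).parse_warc_record(record)
            # create_record_iter() skips falsy entries, so returning None
            # filters the record out of the index.
            if entry is not None and not entry.url:
                return None
            return entry

    # Usage mirrors DefaultRecordIter itself: configure via keyword options,
    # then call the instance with an open file object.
    with open('sample.warc.gz', 'rb') as fh:
        for entry in UrlOnlyRecordIter(minimal=True)(fh):
            print('%s %s' % (entry.key, entry.timestamp))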

pywb/warc/cdxindexer.py

@@ -5,20 +5,21 @@ from bisect import insort
from io import BytesIO

from archiveiterator import DefaultRecordIter


#=================================================================
class CDXWriter(object):
    def __init__(self, out, format_):
        self.out = out
        self.format_ = format_

    def __enter__(self):
        if self.format_ == 'cdx09':
            self.out.write(' CDX N b a m s k r V g\n')
        elif self.format_ == 'cdx06':
            self.out.write(' CDX N b a S V g\n')
        else:
            self.out.write(' CDX N b a m s k r M S V g\n')

        return self
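For reference, the letters in these header lines follow the classic CDX field legend: N is the canonicalized url key, b the timestamp, a the original url, m the mime type, s the status code, k the digest, r the redirect, M meta tags, S the compressed record length, V the file offset, and g the archive filename. The new cdx06 layout thus keeps only what is needed to locate a record: key, timestamp, url, length, offset, and filename.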
@@ -41,17 +42,24 @@ class CDXWriter(object):
        out.write(' ')
        out.write(entry.url)
        out.write(' ')

        if self.format_ != 'cdx06':
            out.write(entry.mime)
            out.write(' ')
            out.write(entry.status)
            out.write(' ')
            out.write(entry.digest)

        if self.format_ == 'cdx09':
            out.write(' - ')
        elif self.format_ == 'cdx06':
            out.write(entry.length)
            out.write(' ')
        else:
            out.write(' - - ')
            out.write(entry.length)
            out.write(' ')

        out.write(entry.offset)
        out.write(' ')
        out.write(filename)
@@ -153,11 +161,12 @@ def write_multi_cdx_index(output, inputs, **options):
    outfile = open(output, 'wb')

    writer_cls = get_cdx_writer_cls(options)
    record_iter = DefaultRecordIter(**options)

    with writer_cls(outfile, options.get('format')) as writer:
        for fullpath, filename in iter_file_or_dir(inputs, recurse):
            with open(fullpath, 'rb') as infile:
                entry_iter = record_iter(infile)

                for entry in entry_iter:
                    writer.write(entry, filename)
@@ -172,8 +181,8 @@ def write_cdx_index(outfile, infile, filename, **options):
    writer_cls = get_cdx_writer_cls(options)

    with writer_cls(outfile, options.get('format')) as writer:
        entry_iter = DefaultRecordIter(**options)(infile)

        for entry in entry_iter:
            writer.write(entry, filename)
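The single-file helper keeps its signature, so programmatic use of the new options is straightforward. A rough sketch (the module path and file names are assumptions, not shown in this diff):

    import sys
    from pywb.warc.cdxindexer import write_cdx_index

    # Write a 6-field, minimal (url + timestamp) index for one WARC to stdout.
    with open('sample.warc.gz', 'rb') as infile:
        write_cdx_index(sys.stdout, infile, 'sample.warc.gz',
                        format='cdx06', minimal=True)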
@@ -260,7 +269,12 @@ if input is a directory"""
                        action='store_true',
                        help=unsurt_help)

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-9', '--cdx09',
                       action='store_true',
                       help=cdx09_help)

    group.add_argument('-6', '--cdx06',
                       action='store_true',
                       help=cdx09_help)
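Because -9 and -6 now sit in a mutually exclusive group, argparse refuses a combined invocation such as cdx-indexer -9 -6 out.cdx in.warc.gz, exiting with an error along the lines of: argument -6/--cdx06: not allowed with argument -9/--cdx09.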
@@ -269,13 +283,20 @@ if input is a directory"""
    cmd = parser.parse_args(args=args)

    format_ = 'cdx11'
    if cmd.cdx09:
        format_ = 'cdx09'
    elif cmd.cdx06:
        format_ = 'cdx06'

    write_multi_cdx_index(cmd.output, cmd.inputs,
                          sort=cmd.sort,
                          surt_ordered=not cmd.unsurt,
                          include_all=cmd.allrecords,
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          format=format_,
                          minimal=cmd.cdx06)


if __name__ == '__main__':