1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx indexer: refactor indexer into mixins for differnt formats for easier customization

This commit is contained in:
Ilya Kreymer 2015-02-25 16:45:47 -08:00
parent ee1fabf600
commit 48eab2662d
2 changed files with 102 additions and 58 deletions

View File

@ -250,6 +250,9 @@ class DefaultRecordIter(object):
append_post = self.options.get('append_post')
include_all = self.options.get('include_all')
block_size = self.options.get('block_size', 16384)
surt_ordered = self.options.get('surt_ordered', True)
minimal = self.options.get('minimal')
append_post = self.options.get('append_post')
for record in arcv_iter.iter_records(block_size):
entry = None
@ -275,17 +278,17 @@ class DefaultRecordIter(object):
continue
if entry.url and not entry.key:
entry.key = canonicalize(entry.url,
self.options.get('surt_ordered', True))
entry.key = canonicalize(entry.url, surt_ordered)
compute_digest = False
if (entry.digest == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
if (not minimal and
entry.digest == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True
elif record.rec_type == 'request' and self.options.get('append_post'):
elif record.rec_type == 'request' and append_post:
method = record.status_headers.protocol
len_ = record.status_headers.get_header('Content-Length')

View File

@ -8,58 +8,41 @@ from io import BytesIO
from archiveiterator import DefaultRecordIter
#=================================================================
class CDXWriter(object):
def __init__(self, out, format_):
class BaseCDXWriter(object):
def __init__(self, out):
self.out = out
self.format_ = format_
def __enter__(self):
if self.format_ == 'cdx09':
self.out.write(' CDX N b a m s k r V g\n')
elif self.format_ == 'cdx06':
self.out.write(' CDX N b a S V g\n')
else:
self.out.write(' CDX N b a m s k r M S V g\n')
self._write_header()
return self
def write(self, entry, filename):
if not entry.url or not entry.key:
return
if entry.record.rec_type == 'warcinfo':
return
self.write_cdx_line(self.out, entry, filename)
def __exit__(self, *args):
return False
def write_cdx_line(self, out, entry, filename):
if entry.record.rec_type == 'warcinfo':
return
#=================================================================
class CDX06(object):
def _write_header(self):
self.out.write(' CDX N b a S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
out.write(' ')
out.write(entry.url)
out.write(' ')
if self.format_ != 'cdx06':
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
if self.format_ == 'cdx09':
out.write(' - ')
elif self.format_ == 'cdx06':
out.write(entry.length)
out.write(' ')
else:
out.write(' - - ')
out.write(entry.length)
out.write(' ')
out.write(entry.length)
out.write(' ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
@ -67,21 +50,72 @@ class CDXWriter(object):
#=================================================================
class SortedCDXWriter(CDXWriter):
class CDX09(object):
def _write_header(self):
self.out.write(' CDX N b a m s k r V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
out.write(' ')
out.write(entry.url)
out.write(' ')
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
out.write(' - ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
out.write('\n')
#=================================================================
class CDX11(object):
def _write_header(self):
self.out.write(' CDX N b a m s k r M S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
out.write(' ')
out.write(entry.url)
out.write(' ')
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
out.write(' - - ')
out.write(entry.length)
out.write(' ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
out.write('\n')
#=================================================================
class SortedCDXWriter(BaseCDXWriter):
def __enter__(self):
self.sortlist = []
return super(SortedCDXWriter, self).__enter__()
res = super(SortedCDXWriter, self).__enter__()
self.actual_out = self.out
return res
def write(self, entry, filename):
outbuff = BytesIO()
self.write_cdx_line(outbuff, entry, filename)
line = outbuff.getvalue()
self.out = BytesIO()
super(SortedCDXWriter, self).write(entry, filename)
line = self.out.getvalue()
if line:
insort(self.sortlist, line)
def __exit__(self, *args):
self.out.write(''.join(self.sortlist))
self.actual_out.write(''.join(self.sortlist))
return False
@ -129,13 +163,25 @@ def cdx_filename(filename):
def get_cdx_writer_cls(options):
writer_cls = options.get('writer_cls')
if not writer_cls:
if options.get('sort'):
writer_cls = SortedCDXWriter
else:
writer_cls = CDXWriter
if writer_cls:
if not options.get('writer_add_mixin'):
return writer_cls
elif options.get('sort'):
writer_cls = SortedCDXWriter
else:
writer_cls = BaseCDXWriter
return writer_cls
if options.get('cdx09'):
format_mixin = CDX09
elif options.get('cdx06'):
format_mixin = CDX06
else:
format_mixin = CDX11
class CDXWriter(writer_cls, format_mixin):
pass
return CDXWriter
#=================================================================
@ -163,7 +209,7 @@ def write_multi_cdx_index(output, inputs, **options):
writer_cls = get_cdx_writer_cls(options)
record_iter = DefaultRecordIter(**options)
with writer_cls(outfile, options.get('format')) as writer:
with writer_cls(outfile) as writer:
for fullpath, filename in iter_file_or_dir(inputs, recurse):
with open(fullpath, 'rb') as infile:
entry_iter = record_iter(infile)
@ -181,7 +227,7 @@ def write_cdx_index(outfile, infile, filename, **options):
writer_cls = get_cdx_writer_cls(options)
with writer_cls(outfile, options.get('format')) as writer:
with writer_cls(outfile) as writer:
entry_iter = DefaultRecordIter(**options)(infile)
for entry in entry_iter:
@ -283,19 +329,14 @@ if input is a directory"""
cmd = parser.parse_args(args=args)
format_ = 'cdx11'
if cmd.cdx09:
format_ = 'cdx09'
elif cmd.cdx06:
format_ = 'cdx06'
write_multi_cdx_index(cmd.output, cmd.inputs,
sort=cmd.sort,
surt_ordered=not cmd.unsurt,
include_all=cmd.allrecords,
append_post=cmd.postappend,
recurse=cmd.recurse,
format=format_,
cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
minimal=cmd.cdx06)