From 48eab2662d41cddc2a3328d2f76476c3a82effc8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 25 Feb 2015 16:45:47 -0800 Subject: [PATCH] cdx indexer: refactor indexer into mixins for different formats for easier customization --- pywb/warc/archiveiterator.py | 13 ++-- pywb/warc/cdxindexer.py | 147 ++++++++++++++++++++++------------- 2 files changed, 102 insertions(+), 58 deletions(-) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 11c35428..4bc17687 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -250,6 +250,9 @@ class DefaultRecordIter(object): append_post = self.options.get('append_post') include_all = self.options.get('include_all') block_size = self.options.get('block_size', 16384) + surt_ordered = self.options.get('surt_ordered', True) + minimal = self.options.get('minimal') + append_post = self.options.get('append_post') for record in arcv_iter.iter_records(block_size): entry = None @@ -275,17 +278,17 @@ class DefaultRecordIter(object): continue if entry.url and not entry.key: - entry.key = canonicalize(entry.url, - self.options.get('surt_ordered', True)) + entry.key = canonicalize(entry.url, surt_ordered) compute_digest = False - if (entry.digest == '-' and - record.rec_type not in ('revisit', 'request', 'warcinfo')): + if (not minimal and + entry.digest == '-' and + record.rec_type not in ('revisit', 'request', 'warcinfo')): compute_digest = True - elif record.rec_type == 'request' and self.options.get('append_post'): + elif record.rec_type == 'request' and append_post: method = record.status_headers.protocol len_ = record.status_headers.get_header('Content-Length') diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index e2c3c0f4..3bcb1c1a 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -8,58 +8,41 @@ from io import BytesIO from archiveiterator import DefaultRecordIter #================================================================= -class
CDXWriter(object): - def __init__(self, out, format_): +class BaseCDXWriter(object): + def __init__(self, out): self.out = out - self.format_ = format_ def __enter__(self): - if self.format_ == 'cdx09': - self.out.write(' CDX N b a m s k r V g\n') - elif self.format_ == 'cdx06': - self.out.write(' CDX N b a S V g\n') - else: - self.out.write(' CDX N b a m s k r M S V g\n') - + self._write_header() return self def write(self, entry, filename): if not entry.url or not entry.key: return + if entry.record.rec_type == 'warcinfo': + return + self.write_cdx_line(self.out, entry, filename) def __exit__(self, *args): return False - def write_cdx_line(self, out, entry, filename): - if entry.record.rec_type == 'warcinfo': - return +#================================================================= +class CDX06(object): + def _write_header(self): + self.out.write(' CDX N b a S V g\n') + + def write_cdx_line(self, out, entry, filename): out.write(entry.key) out.write(' ') out.write(entry.timestamp) out.write(' ') out.write(entry.url) out.write(' ') - - if self.format_ != 'cdx06': - out.write(entry.mime) - out.write(' ') - out.write(entry.status) - out.write(' ') - out.write(entry.digest) - - if self.format_ == 'cdx09': - out.write(' - ') - elif self.format_ == 'cdx06': - out.write(entry.length) - out.write(' ') - else: - out.write(' - - ') - out.write(entry.length) - out.write(' ') - + out.write(entry.length) + out.write(' ') out.write(entry.offset) out.write(' ') out.write(filename) @@ -67,21 +50,72 @@ class CDXWriter(object): #================================================================= -class SortedCDXWriter(CDXWriter): +class CDX09(object): + def _write_header(self): + self.out.write(' CDX N b a m s k r V g\n') + + def write_cdx_line(self, out, entry, filename): + out.write(entry.key) + out.write(' ') + out.write(entry.timestamp) + out.write(' ') + out.write(entry.url) + out.write(' ') + out.write(entry.mime) + out.write(' ') + out.write(entry.status) + out.write(' ') 
+ out.write(entry.digest) + out.write(' - ') + out.write(entry.offset) + out.write(' ') + out.write(filename) + out.write('\n') + + +#================================================================= +class CDX11(object): + def _write_header(self): + self.out.write(' CDX N b a m s k r M S V g\n') + + def write_cdx_line(self, out, entry, filename): + out.write(entry.key) + out.write(' ') + out.write(entry.timestamp) + out.write(' ') + out.write(entry.url) + out.write(' ') + out.write(entry.mime) + out.write(' ') + out.write(entry.status) + out.write(' ') + out.write(entry.digest) + out.write(' - - ') + out.write(entry.length) + out.write(' ') + out.write(entry.offset) + out.write(' ') + out.write(filename) + out.write('\n') + + +#================================================================= +class SortedCDXWriter(BaseCDXWriter): def __enter__(self): self.sortlist = [] - return super(SortedCDXWriter, self).__enter__() + res = super(SortedCDXWriter, self).__enter__() + self.actual_out = self.out + return res def write(self, entry, filename): - outbuff = BytesIO() - self.write_cdx_line(outbuff, entry, filename) - - line = outbuff.getvalue() + self.out = BytesIO() + super(SortedCDXWriter, self).write(entry, filename) + line = self.out.getvalue() if line: insort(self.sortlist, line) def __exit__(self, *args): - self.out.write(''.join(self.sortlist)) + self.actual_out.write(''.join(self.sortlist)) return False @@ -129,13 +163,25 @@ def cdx_filename(filename): def get_cdx_writer_cls(options): writer_cls = options.get('writer_cls') - if not writer_cls: - if options.get('sort'): - writer_cls = SortedCDXWriter - else: - writer_cls = CDXWriter + if writer_cls: + if not options.get('writer_add_mixin'): + return writer_cls + elif options.get('sort'): + writer_cls = SortedCDXWriter + else: + writer_cls = BaseCDXWriter - return writer_cls + if options.get('cdx09'): + format_mixin = CDX09 + elif options.get('cdx06'): + format_mixin = CDX06 + else: + format_mixin = CDX11 + + 
class CDXWriter(writer_cls, format_mixin): + pass + + return CDXWriter #================================================================= @@ -163,7 +209,7 @@ def write_multi_cdx_index(output, inputs, **options): writer_cls = get_cdx_writer_cls(options) record_iter = DefaultRecordIter(**options) - with writer_cls(outfile, options.get('format')) as writer: + with writer_cls(outfile) as writer: for fullpath, filename in iter_file_or_dir(inputs, recurse): with open(fullpath, 'rb') as infile: entry_iter = record_iter(infile) @@ -181,7 +227,7 @@ def write_cdx_index(outfile, infile, filename, **options): writer_cls = get_cdx_writer_cls(options) - with writer_cls(outfile, options.get('format')) as writer: + with writer_cls(outfile) as writer: entry_iter = DefaultRecordIter(**options)(infile) for entry in entry_iter: @@ -283,19 +329,14 @@ if input is a directory""" cmd = parser.parse_args(args=args) - format_ = 'cdx11' - if cmd.cdx09: - format_ = 'cdx09' - elif cmd.cdx06: - format_ = 'cdx06' - write_multi_cdx_index(cmd.output, cmd.inputs, sort=cmd.sort, surt_ordered=not cmd.unsurt, include_all=cmd.allrecords, append_post=cmd.postappend, recurse=cmd.recurse, - format=format_, + cdx09=cmd.cdx09, + cdx06=cmd.cdx06, minimal=cmd.cdx06)