1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx indexer: refactor indexer into mixins for different formats for easier customization

This commit is contained in:
Ilya Kreymer 2015-02-25 16:45:47 -08:00
parent ee1fabf600
commit 48eab2662d
2 changed files with 102 additions and 58 deletions

View File

@ -250,6 +250,9 @@ class DefaultRecordIter(object):
append_post = self.options.get('append_post') append_post = self.options.get('append_post')
include_all = self.options.get('include_all') include_all = self.options.get('include_all')
block_size = self.options.get('block_size', 16384) block_size = self.options.get('block_size', 16384)
surt_ordered = self.options.get('surt_ordered', True)
minimal = self.options.get('minimal')
append_post = self.options.get('append_post')
for record in arcv_iter.iter_records(block_size): for record in arcv_iter.iter_records(block_size):
entry = None entry = None
@ -275,17 +278,17 @@ class DefaultRecordIter(object):
continue continue
if entry.url and not entry.key: if entry.url and not entry.key:
entry.key = canonicalize(entry.url, entry.key = canonicalize(entry.url, surt_ordered)
self.options.get('surt_ordered', True))
compute_digest = False compute_digest = False
if (entry.digest == '-' and if (not minimal and
record.rec_type not in ('revisit', 'request', 'warcinfo')): entry.digest == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True compute_digest = True
elif record.rec_type == 'request' and self.options.get('append_post'): elif record.rec_type == 'request' and append_post:
method = record.status_headers.protocol method = record.status_headers.protocol
len_ = record.status_headers.get_header('Content-Length') len_ = record.status_headers.get_header('Content-Length')

View File

@ -8,58 +8,41 @@ from io import BytesIO
from archiveiterator import DefaultRecordIter from archiveiterator import DefaultRecordIter
#================================================================= #=================================================================
class CDXWriter(object): class BaseCDXWriter(object):
def __init__(self, out, format_): def __init__(self, out):
self.out = out self.out = out
self.format_ = format_
def __enter__(self): def __enter__(self):
if self.format_ == 'cdx09': self._write_header()
self.out.write(' CDX N b a m s k r V g\n')
elif self.format_ == 'cdx06':
self.out.write(' CDX N b a S V g\n')
else:
self.out.write(' CDX N b a m s k r M S V g\n')
return self return self
def write(self, entry, filename): def write(self, entry, filename):
if not entry.url or not entry.key: if not entry.url or not entry.key:
return return
if entry.record.rec_type == 'warcinfo':
return
self.write_cdx_line(self.out, entry, filename) self.write_cdx_line(self.out, entry, filename)
def __exit__(self, *args): def __exit__(self, *args):
return False return False
def write_cdx_line(self, out, entry, filename):
if entry.record.rec_type == 'warcinfo':
return
#=================================================================
class CDX06(object):
def _write_header(self):
self.out.write(' CDX N b a S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key) out.write(entry.key)
out.write(' ') out.write(' ')
out.write(entry.timestamp) out.write(entry.timestamp)
out.write(' ') out.write(' ')
out.write(entry.url) out.write(entry.url)
out.write(' ') out.write(' ')
out.write(entry.length)
if self.format_ != 'cdx06': out.write(' ')
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
if self.format_ == 'cdx09':
out.write(' - ')
elif self.format_ == 'cdx06':
out.write(entry.length)
out.write(' ')
else:
out.write(' - - ')
out.write(entry.length)
out.write(' ')
out.write(entry.offset) out.write(entry.offset)
out.write(' ') out.write(' ')
out.write(filename) out.write(filename)
@ -67,21 +50,72 @@ class CDXWriter(object):
#================================================================= #=================================================================
class SortedCDXWriter(CDXWriter): class CDX09(object):
def _write_header(self):
self.out.write(' CDX N b a m s k r V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
out.write(' ')
out.write(entry.url)
out.write(' ')
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
out.write(' - ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
out.write('\n')
#=================================================================
class CDX11(object):
def _write_header(self):
self.out.write(' CDX N b a m s k r M S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(' ')
out.write(entry.timestamp)
out.write(' ')
out.write(entry.url)
out.write(' ')
out.write(entry.mime)
out.write(' ')
out.write(entry.status)
out.write(' ')
out.write(entry.digest)
out.write(' - - ')
out.write(entry.length)
out.write(' ')
out.write(entry.offset)
out.write(' ')
out.write(filename)
out.write('\n')
#=================================================================
class SortedCDXWriter(BaseCDXWriter):
def __enter__(self): def __enter__(self):
self.sortlist = [] self.sortlist = []
return super(SortedCDXWriter, self).__enter__() res = super(SortedCDXWriter, self).__enter__()
self.actual_out = self.out
return res
def write(self, entry, filename): def write(self, entry, filename):
outbuff = BytesIO() self.out = BytesIO()
self.write_cdx_line(outbuff, entry, filename) super(SortedCDXWriter, self).write(entry, filename)
line = self.out.getvalue()
line = outbuff.getvalue()
if line: if line:
insort(self.sortlist, line) insort(self.sortlist, line)
def __exit__(self, *args): def __exit__(self, *args):
self.out.write(''.join(self.sortlist)) self.actual_out.write(''.join(self.sortlist))
return False return False
@ -129,13 +163,25 @@ def cdx_filename(filename):
def get_cdx_writer_cls(options): def get_cdx_writer_cls(options):
writer_cls = options.get('writer_cls') writer_cls = options.get('writer_cls')
if not writer_cls: if writer_cls:
if options.get('sort'): if not options.get('writer_add_mixin'):
writer_cls = SortedCDXWriter return writer_cls
else: elif options.get('sort'):
writer_cls = CDXWriter writer_cls = SortedCDXWriter
else:
writer_cls = BaseCDXWriter
return writer_cls if options.get('cdx09'):
format_mixin = CDX09
elif options.get('cdx06'):
format_mixin = CDX06
else:
format_mixin = CDX11
class CDXWriter(writer_cls, format_mixin):
pass
return CDXWriter
#================================================================= #=================================================================
@ -163,7 +209,7 @@ def write_multi_cdx_index(output, inputs, **options):
writer_cls = get_cdx_writer_cls(options) writer_cls = get_cdx_writer_cls(options)
record_iter = DefaultRecordIter(**options) record_iter = DefaultRecordIter(**options)
with writer_cls(outfile, options.get('format')) as writer: with writer_cls(outfile) as writer:
for fullpath, filename in iter_file_or_dir(inputs, recurse): for fullpath, filename in iter_file_or_dir(inputs, recurse):
with open(fullpath, 'rb') as infile: with open(fullpath, 'rb') as infile:
entry_iter = record_iter(infile) entry_iter = record_iter(infile)
@ -181,7 +227,7 @@ def write_cdx_index(outfile, infile, filename, **options):
writer_cls = get_cdx_writer_cls(options) writer_cls = get_cdx_writer_cls(options)
with writer_cls(outfile, options.get('format')) as writer: with writer_cls(outfile) as writer:
entry_iter = DefaultRecordIter(**options)(infile) entry_iter = DefaultRecordIter(**options)(infile)
for entry in entry_iter: for entry in entry_iter:
@ -283,19 +329,14 @@ if input is a directory"""
cmd = parser.parse_args(args=args) cmd = parser.parse_args(args=args)
format_ = 'cdx11'
if cmd.cdx09:
format_ = 'cdx09'
elif cmd.cdx06:
format_ = 'cdx06'
write_multi_cdx_index(cmd.output, cmd.inputs, write_multi_cdx_index(cmd.output, cmd.inputs,
sort=cmd.sort, sort=cmd.sort,
surt_ordered=not cmd.unsurt, surt_ordered=not cmd.unsurt,
include_all=cmd.allrecords, include_all=cmd.allrecords,
append_post=cmd.postappend, append_post=cmd.postappend,
recurse=cmd.recurse, recurse=cmd.recurse,
format=format_, cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
minimal=cmd.cdx06) minimal=cmd.cdx06)