mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdx indexer: refactor indexer into mixins for different formats for easier customization
This commit is contained in:
parent
ee1fabf600
commit
48eab2662d
@ -250,6 +250,9 @@ class DefaultRecordIter(object):
|
|||||||
append_post = self.options.get('append_post')
|
append_post = self.options.get('append_post')
|
||||||
include_all = self.options.get('include_all')
|
include_all = self.options.get('include_all')
|
||||||
block_size = self.options.get('block_size', 16384)
|
block_size = self.options.get('block_size', 16384)
|
||||||
|
surt_ordered = self.options.get('surt_ordered', True)
|
||||||
|
minimal = self.options.get('minimal')
|
||||||
|
append_post = self.options.get('append_post')
|
||||||
|
|
||||||
for record in arcv_iter.iter_records(block_size):
|
for record in arcv_iter.iter_records(block_size):
|
||||||
entry = None
|
entry = None
|
||||||
@ -275,17 +278,17 @@ class DefaultRecordIter(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if entry.url and not entry.key:
|
if entry.url and not entry.key:
|
||||||
entry.key = canonicalize(entry.url,
|
entry.key = canonicalize(entry.url, surt_ordered)
|
||||||
self.options.get('surt_ordered', True))
|
|
||||||
|
|
||||||
compute_digest = False
|
compute_digest = False
|
||||||
|
|
||||||
if (entry.digest == '-' and
|
if (not minimal and
|
||||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
entry.digest == '-' and
|
||||||
|
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||||
|
|
||||||
compute_digest = True
|
compute_digest = True
|
||||||
|
|
||||||
elif record.rec_type == 'request' and self.options.get('append_post'):
|
elif record.rec_type == 'request' and append_post:
|
||||||
method = record.status_headers.protocol
|
method = record.status_headers.protocol
|
||||||
len_ = record.status_headers.get_header('Content-Length')
|
len_ = record.status_headers.get_header('Content-Length')
|
||||||
|
|
||||||
|
@ -8,58 +8,41 @@ from io import BytesIO
|
|||||||
from archiveiterator import DefaultRecordIter
|
from archiveiterator import DefaultRecordIter
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXWriter(object):
|
class BaseCDXWriter(object):
|
||||||
def __init__(self, out, format_):
|
def __init__(self, out):
|
||||||
self.out = out
|
self.out = out
|
||||||
self.format_ = format_
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
if self.format_ == 'cdx09':
|
self._write_header()
|
||||||
self.out.write(' CDX N b a m s k r V g\n')
|
|
||||||
elif self.format_ == 'cdx06':
|
|
||||||
self.out.write(' CDX N b a S V g\n')
|
|
||||||
else:
|
|
||||||
self.out.write(' CDX N b a m s k r M S V g\n')
|
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
if not entry.url or not entry.key:
|
if not entry.url or not entry.key:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if entry.record.rec_type == 'warcinfo':
|
||||||
|
return
|
||||||
|
|
||||||
self.write_cdx_line(self.out, entry, filename)
|
self.write_cdx_line(self.out, entry, filename)
|
||||||
|
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
|
||||||
if entry.record.rec_type == 'warcinfo':
|
|
||||||
return
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDX06(object):
|
||||||
|
def _write_header(self):
|
||||||
|
self.out.write(' CDX N b a S V g\n')
|
||||||
|
|
||||||
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry.key)
|
out.write(entry.key)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry.timestamp)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.url)
|
out.write(entry.url)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
|
out.write(entry.length)
|
||||||
if self.format_ != 'cdx06':
|
out.write(' ')
|
||||||
out.write(entry.mime)
|
|
||||||
out.write(' ')
|
|
||||||
out.write(entry.status)
|
|
||||||
out.write(' ')
|
|
||||||
out.write(entry.digest)
|
|
||||||
|
|
||||||
if self.format_ == 'cdx09':
|
|
||||||
out.write(' - ')
|
|
||||||
elif self.format_ == 'cdx06':
|
|
||||||
out.write(entry.length)
|
|
||||||
out.write(' ')
|
|
||||||
else:
|
|
||||||
out.write(' - - ')
|
|
||||||
out.write(entry.length)
|
|
||||||
out.write(' ')
|
|
||||||
|
|
||||||
out.write(entry.offset)
|
out.write(entry.offset)
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(filename)
|
out.write(filename)
|
||||||
@ -67,21 +50,72 @@ class CDXWriter(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class SortedCDXWriter(CDXWriter):
|
class CDX09(object):
|
||||||
|
def _write_header(self):
|
||||||
|
self.out.write(' CDX N b a m s k r V g\n')
|
||||||
|
|
||||||
|
def write_cdx_line(self, out, entry, filename):
|
||||||
|
out.write(entry.key)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.timestamp)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.url)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.mime)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.status)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.digest)
|
||||||
|
out.write(' - ')
|
||||||
|
out.write(entry.offset)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(filename)
|
||||||
|
out.write('\n')
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDX11(object):
|
||||||
|
def _write_header(self):
|
||||||
|
self.out.write(' CDX N b a m s k r M S V g\n')
|
||||||
|
|
||||||
|
def write_cdx_line(self, out, entry, filename):
|
||||||
|
out.write(entry.key)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.timestamp)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.url)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.mime)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.status)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.digest)
|
||||||
|
out.write(' - - ')
|
||||||
|
out.write(entry.length)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(entry.offset)
|
||||||
|
out.write(' ')
|
||||||
|
out.write(filename)
|
||||||
|
out.write('\n')
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class SortedCDXWriter(BaseCDXWriter):
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
self.sortlist = []
|
self.sortlist = []
|
||||||
return super(SortedCDXWriter, self).__enter__()
|
res = super(SortedCDXWriter, self).__enter__()
|
||||||
|
self.actual_out = self.out
|
||||||
|
return res
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
outbuff = BytesIO()
|
self.out = BytesIO()
|
||||||
self.write_cdx_line(outbuff, entry, filename)
|
super(SortedCDXWriter, self).write(entry, filename)
|
||||||
|
line = self.out.getvalue()
|
||||||
line = outbuff.getvalue()
|
|
||||||
if line:
|
if line:
|
||||||
insort(self.sortlist, line)
|
insort(self.sortlist, line)
|
||||||
|
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.out.write(''.join(self.sortlist))
|
self.actual_out.write(''.join(self.sortlist))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@ -129,13 +163,25 @@ def cdx_filename(filename):
|
|||||||
def get_cdx_writer_cls(options):
|
def get_cdx_writer_cls(options):
|
||||||
writer_cls = options.get('writer_cls')
|
writer_cls = options.get('writer_cls')
|
||||||
|
|
||||||
if not writer_cls:
|
if writer_cls:
|
||||||
if options.get('sort'):
|
if not options.get('writer_add_mixin'):
|
||||||
writer_cls = SortedCDXWriter
|
return writer_cls
|
||||||
else:
|
elif options.get('sort'):
|
||||||
writer_cls = CDXWriter
|
writer_cls = SortedCDXWriter
|
||||||
|
else:
|
||||||
|
writer_cls = BaseCDXWriter
|
||||||
|
|
||||||
return writer_cls
|
if options.get('cdx09'):
|
||||||
|
format_mixin = CDX09
|
||||||
|
elif options.get('cdx06'):
|
||||||
|
format_mixin = CDX06
|
||||||
|
else:
|
||||||
|
format_mixin = CDX11
|
||||||
|
|
||||||
|
class CDXWriter(writer_cls, format_mixin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return CDXWriter
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -163,7 +209,7 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
writer_cls = get_cdx_writer_cls(options)
|
writer_cls = get_cdx_writer_cls(options)
|
||||||
record_iter = DefaultRecordIter(**options)
|
record_iter = DefaultRecordIter(**options)
|
||||||
|
|
||||||
with writer_cls(outfile, options.get('format')) as writer:
|
with writer_cls(outfile) as writer:
|
||||||
for fullpath, filename in iter_file_or_dir(inputs, recurse):
|
for fullpath, filename in iter_file_or_dir(inputs, recurse):
|
||||||
with open(fullpath, 'rb') as infile:
|
with open(fullpath, 'rb') as infile:
|
||||||
entry_iter = record_iter(infile)
|
entry_iter = record_iter(infile)
|
||||||
@ -181,7 +227,7 @@ def write_cdx_index(outfile, infile, filename, **options):
|
|||||||
|
|
||||||
writer_cls = get_cdx_writer_cls(options)
|
writer_cls = get_cdx_writer_cls(options)
|
||||||
|
|
||||||
with writer_cls(outfile, options.get('format')) as writer:
|
with writer_cls(outfile) as writer:
|
||||||
entry_iter = DefaultRecordIter(**options)(infile)
|
entry_iter = DefaultRecordIter(**options)(infile)
|
||||||
|
|
||||||
for entry in entry_iter:
|
for entry in entry_iter:
|
||||||
@ -283,19 +329,14 @@ if input is a directory"""
|
|||||||
|
|
||||||
cmd = parser.parse_args(args=args)
|
cmd = parser.parse_args(args=args)
|
||||||
|
|
||||||
format_ = 'cdx11'
|
|
||||||
if cmd.cdx09:
|
|
||||||
format_ = 'cdx09'
|
|
||||||
elif cmd.cdx06:
|
|
||||||
format_ = 'cdx06'
|
|
||||||
|
|
||||||
write_multi_cdx_index(cmd.output, cmd.inputs,
|
write_multi_cdx_index(cmd.output, cmd.inputs,
|
||||||
sort=cmd.sort,
|
sort=cmd.sort,
|
||||||
surt_ordered=not cmd.unsurt,
|
surt_ordered=not cmd.unsurt,
|
||||||
include_all=cmd.allrecords,
|
include_all=cmd.allrecords,
|
||||||
append_post=cmd.postappend,
|
append_post=cmd.postappend,
|
||||||
recurse=cmd.recurse,
|
recurse=cmd.recurse,
|
||||||
format=format_,
|
cdx09=cmd.cdx09,
|
||||||
|
cdx06=cmd.cdx06,
|
||||||
minimal=cmd.cdx06)
|
minimal=cmd.cdx06)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user