1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx-indexer: minor cleanup, add custom writer override to write_multi_cdx_index
This commit is contained in:
Ilya Kreymer 2015-02-04 11:17:26 -08:00
parent ef98716bd8
commit 40fba3c27b
2 changed files with 20 additions and 13 deletions

View File

@ -5,6 +5,8 @@ pywb 0.7.7 changelist
* rules: fix YT rewrite rule, add rule for wikimedia * rules: fix YT rewrite rule, add rule for wikimedia
* cdx-indexer: minor cleanup, add support for custom writer for batched cdx (write_multi_cdx_index)
pywb 0.7.6 changelist pywb 0.7.6 changelist
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~

View File

@ -107,6 +107,19 @@ def cdx_filename(filename):
return remove_ext(filename) + '.cdx' return remove_ext(filename) + '.cdx'
#=================================================================
def get_cdx_writer_cls(options):
    """Select the CDX writer class to use for the given indexer options.

    :param options: dict of indexer options. Only the ``writer_cls`` and
        ``sort`` keys are read here.
    :return: the value stored under ``writer_cls`` when it is truthy;
        otherwise ``SortedCDXWriter`` when ``sort`` is truthy, else
        ``CDXWriter``.
    """
    writer_cls = options.get('writer_cls')
    if writer_cls:
        # A caller-supplied writer always takes precedence over the
        # built-in choices, regardless of the 'sort' option.
        return writer_cls

    if options.get('sort'):
        return SortedCDXWriter

    return CDXWriter
#================================================================= #=================================================================
def write_multi_cdx_index(output, inputs, **options): def write_multi_cdx_index(output, inputs, **options):
# write one cdx per dir # write one cdx per dir
@ -117,7 +130,7 @@ def write_multi_cdx_index(output, inputs, **options):
with open(outpath, 'wb') as outfile: with open(outpath, 'wb') as outfile:
with open(fullpath, 'rb') as infile: with open(fullpath, 'rb') as infile:
write_cdx_index(outfile, infile, filename, **options) return write_cdx_index(outfile, infile, filename, **options)
# write to one cdx file # write to one cdx file
else: else:
@ -126,10 +139,7 @@ def write_multi_cdx_index(output, inputs, **options):
else: else:
outfile = open(output, 'wb') outfile = open(output, 'wb')
if options.get('sort'): writer_cls = get_cdx_writer_cls(options)
writer_cls = SortedCDXWriter
else:
writer_cls = CDXWriter
with writer_cls(outfile, options.get('cdx09')) as writer: with writer_cls(outfile, options.get('cdx09')) as writer:
for fullpath, filename in iter_file_or_dir(inputs): for fullpath, filename in iter_file_or_dir(inputs):
@ -139,20 +149,15 @@ def write_multi_cdx_index(output, inputs, **options):
for entry in entry_iter: for entry in entry_iter:
writer.write(entry, filename) writer.write(entry, filename)
return writer
#================================================================= #=================================================================
def write_cdx_index(outfile, infile, filename, **options): def write_cdx_index(outfile, infile, filename, **options):
writer_cls = options.get('writer_cls')
if type(filename) is unicode: if type(filename) is unicode:
filename = filename.encode(sys.getfilesystemencoding()) filename = filename.encode(sys.getfilesystemencoding())
if writer_cls: writer_cls = get_cdx_writer_cls(options)
pass
elif options.get('sort'):
writer_cls = SortedCDXWriter
else:
writer_cls = CDXWriter
with writer_cls(outfile, options.get('cdx09')) as writer: with writer_cls(outfile, options.get('cdx09')) as writer:
entry_iter = create_index_iter(infile, **options) entry_iter = create_index_iter(infile, **options)