1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/warc/cdxindexer.py
Ilya Kreymer 90eee03cdb fixes for windows:
indexing: ensure '/' always written to cdx
autoindex: improved test case, ensure threads exit with join
style: fix long lines
2015-03-25 10:56:53 -07:00

426 lines
12 KiB
Python

import os
import sys
from json import dumps as json_encode
try: # pragma: no cover
from collections import OrderedDict
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
from argparse import ArgumentParser, RawTextHelpFormatter
from bisect import insort
from io import BytesIO
from archiveiterator import DefaultRecordIter
#=================================================================
class BaseCDXWriter(object):
    """Context-manager base class for cdx writers.

    The line format is not defined here: ``_write_header`` and
    ``write_cdx_line`` are expected to be provided by a format class
    that is combined with this one.
    """

    def __init__(self, out):
        # stream that the header and all cdx lines are written to
        self.out = out

    def __enter__(self):
        """Emit the format header and return the writer itself."""
        self._write_header()
        return self

    def write(self, entry, filename):
        """Write one cdx line for ``entry`` unless the entry is skipped.

        Nothing is written when the entry has a missing/empty 'url' or
        'urlkey', or when its record is of type 'warcinfo'.
        """
        indexable = entry.get('url') and entry.get('urlkey')
        if indexable and entry.record.rec_type != 'warcinfo':
            self.write_cdx_line(self.out, entry, filename)

    def __exit__(self, *args):
        """Return False so exceptions from the ``with`` body propagate."""
        return False
#=================================================================
class CDXJ(object):
    """Format mixin writing lines of the form 'urlkey timestamp {json}'."""

    def _write_header(self):
        """This format has no header line, so nothing is written."""
        pass

    def write_cdx_line(self, out, entry, filename):
        """Write ``entry`` to ``out`` as one JSON-style cdx line.

        The urlkey and timestamp come first, followed by a json dict of
        the remaining fields.  A field is left out of the dict when its
        name starts with '_' or when its value is empty or '-'.
        'filename' is always set from the ``filename`` argument.
        """
        emit = out.write
        emit(entry['urlkey'])
        emit(' ')
        emit(entry['timestamp'])
        emit(' ')

        fields = OrderedDict(
            (name, value) for name, value in entry.iteritems()
            if name not in ('urlkey', 'timestamp')
            and not name.startswith('_')
            and value and value != '-'
        )
        fields['filename'] = filename

        emit(json_encode(fields))
        emit('\n')
#=================================================================
class CDX09(object):
    """Format mixin for the 9-field, space-separated cdx line layout."""

    def _write_header(self):
        """Write the header line naming the 9 columns to ``self.out``."""
        self.out.write(' CDX N b a m s k r V g\n')

    def write_cdx_line(self, out, entry, filename):
        """Write ``entry`` to ``out`` as one 9-field cdx line.

        Column order: urlkey, timestamp, url, mime, status, digest,
        a constant '-', offset, filename.
        """
        emit = out.write
        for field in ('urlkey', 'timestamp', 'url', 'mime', 'status'):
            emit(entry[field])
            emit(' ')

        emit(entry['digest'])
        # the 7th column ('r' in the header) is always written as '-'
        emit(' - ')
        emit(entry['offset'])
        emit(' ')
        emit(filename)
        emit('\n')
#=================================================================
class CDX11(object):
    """Format mixin for the 11-field, space-separated cdx line layout."""

    def _write_header(self):
        """Write the header line naming the 11 columns to ``self.out``."""
        self.out.write(' CDX N b a m s k r M S V g\n')

    def write_cdx_line(self, out, entry, filename):
        """Write ``entry`` to ``out`` as one 11-field cdx line.

        Column order: urlkey, timestamp, url, mime, status, digest,
        two constant '-' columns, length, offset, filename.
        """
        emit = out.write
        for field in ('urlkey', 'timestamp', 'url', 'mime', 'status'):
            emit(entry[field])
            emit(' ')

        emit(entry['digest'])
        # the 7th and 8th columns ('r' and 'M' in the header) are always '-'
        emit(' - - ')
        emit(entry['length'])
        emit(' ')
        emit(entry['offset'])
        emit(' ')
        emit(filename)
        emit('\n')
#=================================================================
class SortedCDXWriter(BaseCDXWriter):
    """Writer that collects every cdx line and emits them sorted on exit."""

    def __enter__(self):
        """Start with an empty line list and remember the real output."""
        self.sortlist = []
        entered = super(SortedCDXWriter, self).__enter__()
        # write() replaces self.out with a scratch buffer for each entry,
        # so keep a handle on the destination stream for __exit__
        self.actual_out = self.out
        return entered

    def write(self, entry, filename):
        """Render ``entry`` into a scratch buffer and keep the line sorted.

        Entries that produce no output are not added to the list.
        """
        scratch = BytesIO()
        self.out = scratch
        super(SortedCDXWriter, self).write(entry, filename)

        rendered = scratch.getvalue()
        if not rendered:
            return

        insort(self.sortlist, rendered)

    def __exit__(self, *args):
        """Write all collected lines, in sorted order, to the real output."""
        sorted_output = ''.join(self.sortlist)
        self.actual_out.write(sorted_output)
        return False
#=================================================================
# File name suffixes treated as archive files when scanning directories
# and when stripping the extension to derive a cdx file name.
ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')
#=================================================================
def _resolve_rel_path(path, rel_root):
path = os.path.relpath(path, rel_root)
if os.path.sep != '/': #pragma: no cover
path = path.replace(os.path.sep, '/')
return path
#=================================================================
def iter_file_or_dir(inputs, recursive=True, rel_root=None):
    """Yield ``(full_path, filename)`` for each archive found in ``inputs``.

    ``inputs`` is an iterable of file or directory paths:

    - a path that is not a directory is always yielded, whatever its
      extension;
    - a directory is scanned (one level with ``recursive`` false, the
      whole tree otherwise) and only names ending with one of
      ``ALLOWED_EXT`` are yielded.

    ``filename`` is the name to record for the archive: the path
    relative to ``rel_root`` when one is given, otherwise the base name
    (files, non-recursive scan) or the path relative to the directory
    being walked (recursive scan).
    """
    for input_ in inputs:
        if not os.path.isdir(input_):
            if rel_root:
                filename = _resolve_rel_path(input_, rel_root)
            else:
                filename = os.path.basename(input_)

            yield input_, filename
            continue

        if not recursive:
            for filename in os.listdir(input_):
                if not filename.endswith(ALLOWED_EXT):
                    continue

                full_path = os.path.join(input_, filename)
                if rel_root:
                    filename = _resolve_rel_path(full_path, rel_root)

                yield full_path, filename
            continue

        # Pick the root per input instead of overwriting ``rel_root``:
        # reassigning it would make every later input resolve against the
        # first walked directory.
        walk_root = rel_root or input_
        for root, dirs, files in os.walk(input_):
            for filename in files:
                if filename.endswith(ALLOWED_EXT):
                    full_path = os.path.join(root, filename)
                    yield full_path, _resolve_rel_path(full_path, walk_root)
#=================================================================
def remove_ext(filename):
    """Return ``filename`` without its archive extension, if it has one.

    Only the first matching entry of ``ALLOWED_EXT`` is stripped; a name
    with no matching extension is returned unchanged.
    """
    matched = next(
        (ext for ext in ALLOWED_EXT if filename.endswith(ext)),
        None,
    )
    if matched is None:
        return filename

    return filename[:-len(matched)]
#=================================================================
def cdx_filename(filename):
    """Return the cdx name for an archive: extension replaced by '.cdx'."""
    stem = remove_ext(filename)
    return stem + '.cdx'
#=================================================================
def get_cdx_writer_cls(options):
    """Return the cdx writer class selected by ``options``.

    Keys read from ``options``:

    - 'minimal': when set, 'cdxj' is also set to True on ``options``
      (the dict passed in is modified).
    - 'writer_cls': custom writer base; returned as-is unless
      'writer_add_mixin' is set.
    - 'sort': use ``SortedCDXWriter`` instead of ``BaseCDXWriter`` when no
      custom writer is given.
    - 'cdxj' / 'cdx09': choose the ``CDXJ`` or ``CDX09`` format mixin;
      ``CDX11`` is used when neither is set.
    """
    if options.get('minimal'):
        options['cdxj'] = True

    base_cls = options.get('writer_cls')
    if base_cls and not options.get('writer_add_mixin'):
        return base_cls

    if not base_cls:
        if options.get('sort'):
            base_cls = SortedCDXWriter
        else:
            base_cls = BaseCDXWriter

    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
        format_mixin = CDX11

    # combine the writing strategy with the chosen line format
    class CDXWriter(base_cls, format_mixin):
        pass

    return CDXWriter
#=================================================================
def write_multi_cdx_index(output, inputs, **options):
    """Index every archive in ``inputs`` into ``output``.

    ``output`` selects the mode:

    - an existing directory: one '.cdx' file is written into it for each
      input archive file;
    - '-': a single combined index is written to ``sys.stdout``;
    - anything else: a single combined index is written to that file.

    ``inputs`` is an iterable of archive files and/or directories.  The
    'recurse' (default False) and 'rel_root' options control how inputs
    are expanded; all options are also passed on to the writer class
    selection and to the record iterator.

    Returns the last writer used, or None when ``output`` is a directory
    and no input archive was found.
    """
    recurse = options.get('recurse', False)
    rel_root = options.get('rel_root')

    # output is a directory: write one cdx per input archive file
    if output != '-' and os.path.isdir(output):
        # stays None if the inputs yield no files at all
        writer = None

        for fullpath, filename in iter_file_or_dir(inputs,
                                                   recurse,
                                                   rel_root):
            outpath = os.path.join(output, cdx_filename(filename))

            with open(outpath, 'wb') as outfile:
                with open(fullpath, 'rb') as infile:
                    writer = write_cdx_index(outfile, infile, filename,
                                             **options)

        return writer

    # otherwise: write everything to one cdx file (or stdout)
    use_stdout = (output == '-')
    outfile = sys.stdout if use_stdout else open(output, 'wb')

    try:
        writer_cls = get_cdx_writer_cls(options)
        record_iter = DefaultRecordIter(**options)

        with writer_cls(outfile) as writer:
            for fullpath, filename in iter_file_or_dir(inputs,
                                                       recurse,
                                                       rel_root):
                with open(fullpath, 'rb') as infile:
                    for entry in record_iter(infile):
                        writer.write(entry, filename)
    finally:
        # close only what was opened here; stdout is left open
        if not use_stdout:
            outfile.close()

    return writer
#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
    """Index the archive stream ``infile`` and write the cdx to ``outfile``.

    ``filename`` is the archive name recorded in each cdx line; a unicode
    name is encoded with the file system encoding first.  ``options`` are
    used both to select the writer class and to configure the record
    iterator.

    Returns the writer that was used.
    """
    # isinstance also covers unicode subclasses, which an exact type
    # comparison would leave un-encoded (``unicode``: Python 2 builtin)
    if isinstance(filename, unicode):
        filename = filename.encode(sys.getfilesystemencoding())

    writer_cls = get_cdx_writer_cls(options)

    with writer_cls(outfile) as writer:
        for entry in DefaultRecordIter(**options)(infile):
            writer.write(entry, filename)

    return writer
#=================================================================
def main(args=None):
    """Command-line entry point: parse ``args`` and write the cdx index.

    ``args`` is a list of command-line argument strings; it is passed
    straight to ``ArgumentParser.parse_args`` (None is the default).
    The parsed flags are translated into keyword options for
    ``write_multi_cdx_index``.
    """
    description = """
Generate .cdx index files for WARCs and ARCs
Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats
are supported.
"""

    epilog = """
Some examples:
* Create "example.cdx" index from example.warc.gz
{0} ./cdx/example.cdx ./warcs/example.warc.gz
* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
{0} --sort combined.cdx ./warcs/
* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
{0} --sort ./cdx/ ./warcs/
""".format(os.path.basename(sys.argv[0]))

    sort_help = """
Sort the output to each file before writing to create a total ordering
"""

    unsurt_help = """
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not-recommended for new cdx, use only for backwards-compatibility.
"""

    cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field
"""

    minimal_json_help = """
CDX JSON output, but with minimal fields only, available w/o parsing
http record. The fields are: canonicalized url, timestamp,
original url, digest, archive offset, archive length
and archive filename. mimetype is included to indicate warc/revisit only.
This option skips record parsing and will not work with
POST append (-p) option
"""

    json_help = """
Output CDX JSON format per line, with url timestamp first,
followed by a json dict for all other fields:
url timestamp { ... }
"""

    output_help = """
Output file or directory.
- If directory, each input file is written to a seperate output file
with a .cdx extension
- If output is '-', output is written to stdout
"""

    input_help = """
Input file or directory.
- If directory, all archive files from that directory are read
"""

    allrecords_help = """
Include All records.
currently includes the 'request' records in addition to all
response records
"""

    post_append_help = """
For POST requests, append form query to url key.
(Only applies to form url encoded posts)
"""

    recurse_dirs_help = """
Recurse through all subdirectories if the input is a directory
"""

    dir_root_help = """
Make CDX filenames relative to specified root directory,
instead of current working directory
"""

    parser = ArgumentParser(description=description,
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    # on/off switches, registered in the order they are listed in --help
    leading_switches = (
        ('-s', '--sort', sort_help),
        ('-a', '--allrecords', allrecords_help),
        ('-p', '--postappend', post_append_help),
        ('-r', '--recurse', recurse_dirs_help),
    )
    for short_flag, long_flag, help_text in leading_switches:
        parser.add_argument(short_flag, long_flag,
                            action='store_true',
                            help=help_text)

    parser.add_argument('-d', '--dir-root', help=dir_root_help)
    parser.add_argument('-u', '--unsurt',
                        action='store_true',
                        help=unsurt_help)

    # -9 and -j cannot be given together
    format_group = parser.add_mutually_exclusive_group()
    for short_flag, long_flag, help_text in (('-9', '--cdx09', cdx09_help),
                                             ('-j', '--cdxj', json_help)):
        format_group.add_argument(short_flag, long_flag,
                                  action='store_true',
                                  help=help_text)

    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)

    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)

    index_options = dict(
        sort=cmd.sort,
        # SURT keys are the default; --unsurt switches them off
        surt_ordered=not cmd.unsurt,
        include_all=cmd.allrecords,
        append_post=cmd.postappend,
        recurse=cmd.recurse,
        rel_root=cmd.dir_root,
        cdx09=cmd.cdx09,
        cdxj=cmd.cdxj,
        minimal=cmd.minimal_cdxj,
    )

    write_multi_cdx_index(cmd.output, cmd.inputs, **index_options)
# Run the command-line indexer when this file is executed as a script.
if __name__ == '__main__':
    main()