import os
import sys

# Use ujson if available
try:
    from ujson import dumps as ujson_dumps

    try:
        assert (ujson_dumps('http://example.com/',
                            escape_forward_slashes=False) ==
                '"http://example.com/"')
    except Exception as e:  # pragma: no cover
        sys.stderr.write('ujson w/o forward-slash escaping not available, '
                         'defaulting to regular json\n')
        raise

    def json_encode(obj):
        return ujson_dumps(obj, escape_forward_slashes=False)

except:  # pragma: no cover
    from json import dumps as json_encode

try:  # pragma: no cover
    from collections import OrderedDict
except ImportError:  # pragma: no cover
    from ordereddict import OrderedDict

from argparse import ArgumentParser, RawTextHelpFormatter

from six import StringIO
from pywb.indexer.archiveindexer import DefaultRecordParser

import codecs
import six


#=================================================================
class BaseCDXWriter(object):
    # To ensure we do not index metadata mime types
    # from older WARC specs (Heritrix 1.x) that collide with response records
    METADATA_NO_INDEX_TYPES = ('text/anvl', )

    def __init__(self, out):
        self.out = codecs.getwriter('utf-8')(out)
        #self.out = out

    def __enter__(self):
        self._write_header()
        return self

    def write(self, entry, filename):
        if not entry.get('url') or not entry.get('urlkey'):
            return

        if self._is_skipped(entry):
            return

        self.write_cdx_line(self.out, entry, filename)

    def _is_skipped(self, entry):
        if entry.record.rec_type == 'warcinfo':
            return True

        return (entry.record.rec_type == 'metadata' and
                entry['mime'] in self.METADATA_NO_INDEX_TYPES)

    def __exit__(self, *args):
        return False


#=================================================================
class CDXJ(object):
    def _write_header(self):
        pass

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')

        outdict = OrderedDict()

        for n, v in six.iteritems(entry):
            if n in ('urlkey', 'timestamp'):
                continue

            if n.startswith('_'):
                continue

            if not v or v == '-':
                continue

            outdict[n] = v

        outdict['filename'] = filename
        out.write(json_encode(outdict))
        out.write('\n')
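
# Illustrative sketch of what a resulting CDXJ line looks like (the field
# values below are invented for illustration, not produced by this module):
#
#   com,example)/ 20140101000000 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "ABC123", "length": "1043", "offset": "334", "filename": "example.warc.gz"}
#
# i.e. the urlkey and timestamp as plain text, followed by a JSON object of
# the remaining non-empty fields plus the archive filename.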


#=================================================================
class CDX09(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')


#=================================================================
class CDX11(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r M S V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - - ')
        out.write(entry['length'])
        out.write(' ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
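
# Illustrative sketch of an 11-field CDX line matching the header written
# above (' CDX N b a m s k r M S V g'); the values are hypothetical:
#
#   com,example)/ 20140101000000 http://example.com/ text/html 200 ABC123 - - 1043 334 example.warc.gz
#
# The two '-' columns are the unused redirect (r) and meta (M) fields.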


#=================================================================
class SortedCDXWriter(BaseCDXWriter):
    def __enter__(self):
        self.sortlist = []
        res = super(SortedCDXWriter, self).__enter__()
        self.actual_out = self.out
        return res

    def write(self, entry, filename):
        self.out = StringIO()
        super(SortedCDXWriter, self).write(entry, filename)

        line = self.out.getvalue()
        if line:
            self.sortlist.append(line)

    def __exit__(self, *args):
        self.sortlist.sort()
        self.actual_out.write(''.join(self.sortlist))
        return False


#=================================================================
ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')


#=================================================================
def _resolve_rel_path(path, rel_root):
    path = os.path.relpath(path, rel_root)
    if os.path.sep != '/':  #pragma: no cover
        path = path.replace(os.path.sep, '/')
    return path


#=================================================================
def iter_file_or_dir(inputs, recursive=True, rel_root=None):
    for input_ in inputs:
        if not os.path.isdir(input_):
            if not rel_root:
                filename = os.path.basename(input_)
            else:
                filename = _resolve_rel_path(input_, rel_root)

            yield input_, filename

        elif not recursive:
            for filename in os.listdir(input_):
                if filename.endswith(ALLOWED_EXT):
                    full_path = os.path.join(input_, filename)
                    if rel_root:
                        filename = _resolve_rel_path(full_path, rel_root)

                    yield full_path, filename

        else:
            for root, dirs, files in os.walk(input_):
                for filename in files:
                    if filename.endswith(ALLOWED_EXT):
                        full_path = os.path.join(root, filename)
                        if not rel_root:
                            rel_root = input_
                        rel_path = _resolve_rel_path(full_path, rel_root)
                        yield full_path, rel_path
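
# For example (hypothetical layout): with inputs=['./warcs/'] and
# recursive=True, a file ./warcs/a/b.warc.gz is yielded as the pair
# ('./warcs/a/b.warc.gz', 'a/b.warc.gz'), i.e. the full path plus a name
# relative to the input directory (or to rel_root, if one is given).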


#=================================================================
def remove_ext(filename):
    for ext in ALLOWED_EXT:
        if filename.endswith(ext):
            filename = filename[:-len(ext)]
            break

    return filename


#=================================================================
def cdx_filename(filename):
    return remove_ext(filename) + '.cdx'


#=================================================================
def get_cdx_writer_cls(options):
    if options.get('minimal'):
        options['cdxj'] = True

    writer_cls = options.get('writer_cls')

    if writer_cls:
        if not options.get('writer_add_mixin'):
            return writer_cls
    elif options.get('sort'):
        writer_cls = SortedCDXWriter
    else:
        writer_cls = BaseCDXWriter

    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
        format_mixin = CDX11

    class CDXWriter(writer_cls, format_mixin):
        pass

    return CDXWriter
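
# For example (a sketch, using the same option keys main() builds below):
# {'sort': True, 'cdxj': True} yields a class combining SortedCDXWriter with
# the CDXJ mixin, while an empty options dict yields BaseCDXWriter + CDX11:
#
#   writer_cls = get_cdx_writer_cls({'sort': True, 'cdxj': True})
#   with writer_cls(open('out.cdxj', 'wb')) as writer:  # 'out.cdxj' is a placeholder
#       ...  # call writer.write(entry, filename) for each parsed entry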


#=================================================================
def write_multi_cdx_index(output, inputs, **options):
    recurse = options.get('recurse', False)
    rel_root = options.get('rel_root')

    # write one cdx per dir
    if output != '-' and os.path.isdir(output):
        for fullpath, filename in iter_file_or_dir(inputs,
                                                   recurse,
                                                   rel_root):
            outpath = cdx_filename(filename)
            outpath = os.path.join(output, outpath)

            with open(outpath, 'wb') as outfile:
                with open(fullpath, 'rb') as infile:
                    writer = write_cdx_index(outfile, infile, filename,
                                             **options)

        return writer

    # write to one cdx file
    else:
        if output == '-':
            if hasattr(sys.stdout, 'buffer'):
                outfile = sys.stdout.buffer
            else:
                outfile = sys.stdout
        else:
            outfile = open(output, 'wb')

        writer_cls = get_cdx_writer_cls(options)
        record_iter = DefaultRecordParser(**options)

        with writer_cls(outfile) as writer:
            for fullpath, filename in iter_file_or_dir(inputs,
                                                       recurse,
                                                       rel_root):
                with open(fullpath, 'rb') as infile:
                    entry_iter = record_iter(infile)

                    for entry in entry_iter:
                        writer.write(entry, filename)

        return writer
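
# Example call (a sketch; './cdx/' and './warcs/' are placeholder paths that
# must already exist):
#
#   write_multi_cdx_index('./cdx/', ['./warcs/'], recurse=True, sort=True)
#
# With an existing directory as output, one .cdx file is written per input
# archive; with output '-' a single index is streamed to stdout instead.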


#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
    #filename = filename.encode(sys.getfilesystemencoding())

    writer_cls = get_cdx_writer_cls(options)

    with writer_cls(outfile) as writer:
        entry_iter = DefaultRecordParser(**options)(infile)

        for entry in entry_iter:
            writer.write(entry, filename)

    return writer
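
# Minimal single-archive usage (a sketch; the file names are placeholders):
#
#   with open('example.cdx', 'wb') as out, open('example.warc.gz', 'rb') as warc:
#       write_cdx_index(out, warc, 'example.warc.gz')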


#=================================================================
def main(args=None):
    description = """
Generate .cdx index files for WARCs and ARCs
Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats
are supported.
"""

    epilog = """
Some examples:

* Create "example.cdx" index from example.warc.gz
{0} --output ./cdx/example.cdx ./warcs/example.warc.gz

* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
{0} --sort --output combined.cdx ./warcs/

* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
{0} --sort --output ./cdx/ ./warcs/
""".format(os.path.basename(sys.argv[0]))

    sort_help = """
Sort the output to each file before writing to create a total ordering
"""

    unsurt_help = """
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not recommended for new cdx; use only for backwards compatibility.
"""

    verify_help = """
Verify HTTP protocol (1.0/1.1) status in response records and http verb
on request records, ensuring the protocol or verb matches the expected list.
Raise an exception on failure. (This was previously the default behavior.)
"""

    cdx09_help = """
Use older 9-field cdx format; default is 11-field cdx
"""

    minimal_json_help = """
CDX JSON output, but with minimal fields only, available without parsing
the http record. The fields are: canonicalized url, timestamp,
original url, digest, archive offset, archive length
and archive filename. mimetype is included to indicate warc/revisit only.

This option skips record parsing and will not work with
the POST append (-p) option
"""

    json_help = """
Output CDX JSON format per line, with url timestamp first,
followed by a json dict for all other fields:
url timestamp { ... }
"""

    output_help = """
Output file or directory.
- If directory, each input file is written to a separate output file
  with a .cdx extension
- If output is '-', output is written to stdout
"""

    input_help = """
Input file or directory.
- If directory, all archive files from that directory are read
"""

    allrecords_help = """
Include all records.
Currently includes the 'request' records in addition to all
response records
"""

    post_append_help = """
For POST requests, append the form query to the url key.
(Only applies to form url encoded posts)
"""

    recurse_dirs_help = """
Recurse through all subdirectories if the input is a directory
"""

    dir_root_help = """
Make CDX filenames relative to the specified root directory,
instead of the current working directory
"""

    parser = ArgumentParser(description=description,
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    parser.add_argument('-s', '--sort',
                        action='store_true',
                        help=sort_help)

    parser.add_argument('-a', '--allrecords',
                        action='store_true',
                        help=allrecords_help)

    parser.add_argument('-p', '--postappend',
                        action='store_true',
                        help=post_append_help)

    parser.add_argument('-r', '--recurse',
                        action='store_true',
                        help=recurse_dirs_help)

    parser.add_argument('-d', '--dir-root',
                        help=dir_root_help)

    parser.add_argument('-u', '--unsurt',
                        action='store_true',
                        help=unsurt_help)

    parser.add_argument('-v', '--verify',
                        action='store_true',
                        help=verify_help)

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-9', '--cdx09',
                       action='store_true',
                       help=cdx09_help)

    group.add_argument('-j', '--cdxj',
                       action='store_true',
                       help=json_help)

    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)

    parser.add_argument('-o', '--output',
                        default='-', help=output_help)

    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)

    write_multi_cdx_index(cmd.output, cmd.inputs,
                          sort=cmd.sort,
                          surt_ordered=not cmd.unsurt,
                          include_all=cmd.allrecords,
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          rel_root=cmd.dir_root,
                          verify_http=cmd.verify,
                          cdx09=cmd.cdx09,
                          cdxj=cmd.cdxj,
                          minimal=cmd.minimal_cdxj)


if __name__ == '__main__':
    main()