1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

indexing: refactor ArchiveIndexEntry to be a dict instead of adding attribs. Allows for better tracking of indexed properties.

Add json-based cdx (cdxj) output where all fields except the key and timestamp are in a json dict. Support for both minimal and full json cdx, tracked via #76
This commit is contained in:
Ilya Kreymer 2015-03-08 12:01:24 -07:00
parent bfe590996b
commit 6f9808f090
5 changed files with 111 additions and 75 deletions

View File

@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
#============================================================================= #=============================================================================
class CollectionsManager(object): class CollectionsManager(object): #pragma: no cover
""" This utility is designed to """ This utility is designed to
simplify the creation and management of web archive collections simplify the creation and management of web archive collections

View File

@ -189,36 +189,31 @@ class ArchiveIterator(object):
#================================================================= #=================================================================
class ArchiveIndexEntry(object): class ArchiveIndexEntry(dict):
MIME_RE = re.compile('[; ]') MIME_RE = re.compile('[; ]')
def __init__(self):
self.url = None
self.key = None
self.digest = '-'
def extract_mime(self, mime, def_mime='unk'): def extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only """ Utility function to extract mimetype only
from a full content type, removing charset settings from a full content type, removing charset settings
""" """
self.mime = def_mime self['mime'] = def_mime
if mime: if mime:
self.mime = self.MIME_RE.split(mime, 1)[0] self['mime'] = self.MIME_RE.split(mime, 1)[0]
def extract_status(self, status_headers): def extract_status(self, status_headers):
""" Extract status code only from status line """ Extract status code only from status line
""" """
self.status = status_headers.get_statuscode() self['status'] = status_headers.get_statuscode()
if not self.status: if not self['status']:
self.status = '-' self['status'] = '-'
if self.status == '204' and 'Error' in status_headers.statusline: elif self['status'] == '204' and 'Error' in status_headers.statusline:
self.status = '-' self['status'] = '-'
def set_rec_info(self, offset, length, digest): def set_rec_info(self, offset, length, digest):
self.offset = str(offset) self['offset'] = str(offset)
self.length = str(length) self['length'] = str(length)
if digest: if digest:
self.digest = digest self['digest'] = digest
def merge_request_data(self, other, options): def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered', True) surt_ordered = options.get('surt_ordered', True)
@ -231,14 +226,15 @@ class ArchiveIndexEntry(object):
return False return False
# merge POST/PUT body query # merge POST/PUT body query
if hasattr(other, 'post_query'): post_query = other.get('_post_query')
url = append_post_query(self.url, other.post_query) if post_query:
self.key = canonicalize(url, surt_ordered) url = append_post_query(self['url'], post_query)
other.key = self.key self['key'] = canonicalize(url, surt_ordered)
other['key'] = self['key']
referer = other.record.status_headers.get_header('referer') referer = other.record.status_headers.get_header('referer')
if referer: if referer:
self.referer = referer self['_referer'] = referer
return True return True
@ -263,7 +259,7 @@ class DefaultRecordIter(object):
for record in arcv_iter.iter_records(block_size): for record in arcv_iter.iter_records(block_size):
entry = None entry = None
if not include_all and (record.status_headers.get_statuscode() == '-'): if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
continue continue
if record.format == 'warc': if record.format == 'warc':
@ -283,27 +279,27 @@ class DefaultRecordIter(object):
if not entry: if not entry:
continue continue
if entry.url and not entry.key: if entry.get('url') and not entry.get('key'):
entry.key = canonicalize(entry.url, surt_ordered) entry['key'] = canonicalize(entry['url'], surt_ordered)
compute_digest = False compute_digest = False
if (not minimal and if (not minimal and
entry.digest == '-' and entry.get('digest', '-') == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')): record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True compute_digest = True
elif record.rec_type == 'request' and append_post: elif not minimal and record.rec_type == 'request' and append_post:
method = record.status_headers.protocol method = record.status_headers.protocol
len_ = record.status_headers.get_header('Content-Length') len_ = record.status_headers.get_header('Content-Length')
post_query = extract_post_query(method, post_query = extract_post_query(method,
entry.mime, entry.get('mime'),
len_, len_,
record.stream) record.stream)
entry.post_query = post_query entry['_post_query'] = post_query
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest)) #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest) arcv_iter.read_to_end(record, compute_digest)
@ -321,7 +317,7 @@ class DefaultRecordIter(object):
continue continue
# check for url match # check for url match
if (entry.url != prev_entry.url): if (entry['url'] != prev_entry['url']):
pass pass
# check for concurrency also # check for concurrency also
@ -351,23 +347,23 @@ class DefaultRecordIter(object):
entry = ArchiveIndexEntry() entry = ArchiveIndexEntry()
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry.url = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry.key = entry.url entry['key'] = entry['url']
entry.warcinfo = record.stream.read(record.length) entry['_warcinfo'] = record.stream.read(record.length)
return entry return entry
entry.url = record.rec_headers.get_header('WARC-Target-Uri') entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
# timestamp # timestamp
entry.timestamp = iso_date_to_timestamp(record.rec_headers. entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
get_header('WARC-Date')) get_header('WARC-Date'))
if self.options.get('minimal'): if self.options.get('minimal'):
return entry return entry
# mime # mime
if record.rec_type == 'revisit': if record.rec_type == 'revisit':
entry.mime = 'warc/revisit' entry['mime'] = 'warc/revisit'
else: else:
def_mime = '-' if record.rec_type == 'request' else 'unk' def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers. entry.extract_mime(record.status_headers.
@ -378,15 +374,16 @@ class DefaultRecordIter(object):
if record.rec_type == 'response': if record.rec_type == 'response':
entry.extract_status(record.status_headers) entry.extract_status(record.status_headers)
else: else:
entry.status = '-' entry['status'] = '-'
# digest # digest
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest') digest = record.rec_headers.get_header('WARC-Payload-Digest')
if entry.digest and entry.digest.startswith('sha1:'): entry['digest'] = digest
entry.digest = entry.digest[len('sha1:'):] if digest and digest.startswith('sha1:'):
entry['digest'] = digest[len('sha1:'):]
if not entry.digest: elif not entry.get('digest'):
entry.digest = '-' entry['digest'] = '-'
return entry return entry
@ -407,12 +404,12 @@ class DefaultRecordIter(object):
url = url.replace('\x00', '%00') url = url.replace('\x00', '%00')
entry = ArchiveIndexEntry() entry = ArchiveIndexEntry()
entry.url = url entry['url'] = url
# timestamp # timestamp
entry.timestamp = record.rec_headers.get_header('archive-date') entry['timestamp'] = record.rec_headers.get_header('archive-date')
if len(entry.timestamp) > 14: if len(entry['timestamp']) > 14:
entry.timestamp = entry.timestamp[:14] entry['timestamp'] = entry['timestamp'][:14]
if self.options.get('minimal'): if self.options.get('minimal'):
return entry return entry
@ -424,7 +421,7 @@ class DefaultRecordIter(object):
entry.extract_mime(record.rec_headers.get_header('content-type')) entry.extract_mime(record.rec_headers.get_header('content-type'))
# digest # digest
entry.digest = '-' entry['digest'] = '-'
return entry return entry

View File

@ -1,5 +1,7 @@
import os import os
import sys import sys
import json
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from bisect import insort from bisect import insort
@ -17,7 +19,7 @@ class BaseCDXWriter(object):
return self return self
def write(self, entry, filename): def write(self, entry, filename):
if not entry.url or not entry.key: if not entry.get('url') or not entry.get('key'):
return return
if entry.record.rec_type == 'warcinfo': if entry.record.rec_type == 'warcinfo':
@ -29,21 +31,48 @@ class BaseCDXWriter(object):
return False return False
#=================================================================
class CDXJ(object):
def _write_header(self):
pass
def write_cdx_line(self, out, entry, filename):
out.write(entry['key'])
out.write(' ')
out.write(entry['timestamp'])
out.write(' ')
outdict = {}
outdict['filename'] = filename
for n, v in entry.iteritems():
if n in ('key', 'timestamp'):
continue
if n.startswith('_'):
continue
outdict[n] = v
out.write(json.dumps(outdict))
out.write('\n')
#================================================================= #=================================================================
class CDX06(object): class CDX06(object):
def _write_header(self): def _write_header(self):
self.out.write(' CDX N b a S V g\n') self.out.write(' CDX N b a S V g\n')
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry.key) out.write(entry['key'])
out.write(' ') out.write(' ')
out.write(entry.timestamp) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
out.write(entry.url) out.write(entry['url'])
out.write(' ') out.write(' ')
out.write(entry.length) out.write(entry['length'])
out.write(' ') out.write(' ')
out.write(entry.offset) out.write(entry['offset'])
out.write(' ') out.write(' ')
out.write(filename) out.write(filename)
out.write('\n') out.write('\n')
@ -55,19 +84,19 @@ class CDX09(object):
self.out.write(' CDX N b a m s k r V g\n') self.out.write(' CDX N b a m s k r V g\n')
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry.key) out.write(entry['key'])
out.write(' ') out.write(' ')
out.write(entry.timestamp) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
out.write(entry.url) out.write(entry['url'])
out.write(' ') out.write(' ')
out.write(entry.mime) out.write(entry['mime'])
out.write(' ') out.write(' ')
out.write(entry.status) out.write(entry['status'])
out.write(' ') out.write(' ')
out.write(entry.digest) out.write(entry['digest'])
out.write(' - ') out.write(' - ')
out.write(entry.offset) out.write(entry['offset'])
out.write(' ') out.write(' ')
out.write(filename) out.write(filename)
out.write('\n') out.write('\n')
@ -79,21 +108,21 @@ class CDX11(object):
self.out.write(' CDX N b a m s k r M S V g\n') self.out.write(' CDX N b a m s k r M S V g\n')
def write_cdx_line(self, out, entry, filename): def write_cdx_line(self, out, entry, filename):
out.write(entry.key) out.write(entry['key'])
out.write(' ') out.write(' ')
out.write(entry.timestamp) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
out.write(entry.url) out.write(entry['url'])
out.write(' ') out.write(' ')
out.write(entry.mime) out.write(entry['mime'])
out.write(' ') out.write(' ')
out.write(entry.status) out.write(entry['status'])
out.write(' ') out.write(' ')
out.write(entry.digest) out.write(entry['digest'])
out.write(' - - ') out.write(' - - ')
out.write(entry.length) out.write(entry['length'])
out.write(' ') out.write(' ')
out.write(entry.offset) out.write(entry['offset'])
out.write(' ') out.write(' ')
out.write(filename) out.write(filename)
out.write('\n') out.write('\n')
@ -171,10 +200,12 @@ def get_cdx_writer_cls(options):
else: else:
writer_cls = BaseCDXWriter writer_cls = BaseCDXWriter
if options.get('cdx09'): if options.get('cdxj'):
format_mixin = CDX09 format_mixin = CDXJ
elif options.get('minimal'): elif options.get('cdx06') or options.get('minimal'):
format_mixin = CDX06 format_mixin = CDX06
elif options.get('cdx09'):
format_mixin = CDX09
else: else:
format_mixin = CDX11 format_mixin = CDX11
@ -329,7 +360,13 @@ if input is a directory"""
action='store_true', action='store_true',
help=cdx09_help) help=cdx09_help)
group.add_argument('-m', '--minimal', group.add_argument('-6', '--cdx06',
action='store_true')
group.add_argument('-j', '--cdxj',
action='store_true')
parser.add_argument('-m', '--minimal',
action='store_true', action='store_true',
help=minimal_help) help=minimal_help)
@ -345,6 +382,8 @@ if input is a directory"""
append_post=cmd.postappend, append_post=cmd.postappend,
recurse=cmd.recurse, recurse=cmd.recurse,
cdx09=cmd.cdx09, cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
cdxj=cmd.cdxj,
minimal=cmd.minimal) minimal=cmd.minimal)

View File

@ -132,7 +132,7 @@ class ArcWarcRecordLoader:
# don't parse the http record at all # don't parse the http record at all
if no_record_parse: if no_record_parse:
status_headers = StatusAndHeaders('', []) status_headers = None#StatusAndHeaders('', [])
# if empty record (error or otherwise) set status to 204 # if empty record (error or otherwise) set status to 204
elif length == 0: elif length == 0:

View File

@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
#================================================================= #=================================================================
class DirectoryCollsLoader(object): class DirectoryCollsLoader(object): #pragma: no cover
def __init__(self, config, static_routes): def __init__(self, config, static_routes):
self.config = config self.config = config
self.static_routes = static_routes self.static_routes = static_routes