1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

indexing: refactor ArchiveIndexEntry to be a dict instead of setting attributes on the object. This allows for better tracking of indexed properties.

Add JSON-based cdx (cdxj) output, where all fields except the key and timestamp are stored in a JSON dict. Supports both minimal and full JSON cdx; tracked via #76.
This commit is contained in:
Ilya Kreymer 2015-03-08 12:01:24 -07:00
parent bfe590996b
commit 6f9808f090
5 changed files with 111 additions and 75 deletions

View File

@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
#=============================================================================
class CollectionsManager(object):
class CollectionsManager(object): #pragma: no cover
""" This utility is designed to
simplify the creation and management of web archive collections

View File

@ -189,36 +189,31 @@ class ArchiveIterator(object):
#=================================================================
class ArchiveIndexEntry(object):
class ArchiveIndexEntry(dict):
MIME_RE = re.compile('[; ]')
def __init__(self):
self.url = None
self.key = None
self.digest = '-'
def extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only
from a full content type, removing charset settings
"""
self.mime = def_mime
self['mime'] = def_mime
if mime:
self.mime = self.MIME_RE.split(mime, 1)[0]
self['mime'] = self.MIME_RE.split(mime, 1)[0]
def extract_status(self, status_headers):
""" Extract status code only from status line
"""
self.status = status_headers.get_statuscode()
if not self.status:
self.status = '-'
if self.status == '204' and 'Error' in status_headers.statusline:
self.status = '-'
self['status'] = status_headers.get_statuscode()
if not self['status']:
self['status'] = '-'
elif self['status'] == '204' and 'Error' in status_headers.statusline:
self['status'] = '-'
def set_rec_info(self, offset, length, digest):
self.offset = str(offset)
self.length = str(length)
self['offset'] = str(offset)
self['length'] = str(length)
if digest:
self.digest = digest
self['digest'] = digest
def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered', True)
@ -231,14 +226,15 @@ class ArchiveIndexEntry(object):
return False
# merge POST/PUT body query
if hasattr(other, 'post_query'):
url = append_post_query(self.url, other.post_query)
self.key = canonicalize(url, surt_ordered)
other.key = self.key
post_query = other.get('_post_query')
if post_query:
url = append_post_query(self['url'], post_query)
self['key'] = canonicalize(url, surt_ordered)
other['key'] = self['key']
referer = other.record.status_headers.get_header('referer')
if referer:
self.referer = referer
self['_referer'] = referer
return True
@ -263,7 +259,7 @@ class DefaultRecordIter(object):
for record in arcv_iter.iter_records(block_size):
entry = None
if not include_all and (record.status_headers.get_statuscode() == '-'):
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
continue
if record.format == 'warc':
@ -283,27 +279,27 @@ class DefaultRecordIter(object):
if not entry:
continue
if entry.url and not entry.key:
entry.key = canonicalize(entry.url, surt_ordered)
if entry.get('url') and not entry.get('key'):
entry['key'] = canonicalize(entry['url'], surt_ordered)
compute_digest = False
if (not minimal and
entry.digest == '-' and
entry.get('digest', '-') == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True
elif record.rec_type == 'request' and append_post:
elif not minimal and record.rec_type == 'request' and append_post:
method = record.status_headers.protocol
len_ = record.status_headers.get_header('Content-Length')
post_query = extract_post_query(method,
entry.mime,
entry.get('mime'),
len_,
record.stream)
entry.post_query = post_query
entry['_post_query'] = post_query
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest)
@ -321,7 +317,7 @@ class DefaultRecordIter(object):
continue
# check for url match
if (entry.url != prev_entry.url):
if (entry['url'] != prev_entry['url']):
pass
# check for concurrency also
@ -351,23 +347,23 @@ class DefaultRecordIter(object):
entry = ArchiveIndexEntry()
if record.rec_type == 'warcinfo':
entry.url = record.rec_headers.get_header('WARC-Filename')
entry.key = entry.url
entry.warcinfo = record.stream.read(record.length)
entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry['key'] = entry['url']
entry['_warcinfo'] = record.stream.read(record.length)
return entry
entry.url = record.rec_headers.get_header('WARC-Target-Uri')
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
# timestamp
entry.timestamp = iso_date_to_timestamp(record.rec_headers.
get_header('WARC-Date'))
entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
get_header('WARC-Date'))
if self.options.get('minimal'):
return entry
# mime
if record.rec_type == 'revisit':
entry.mime = 'warc/revisit'
entry['mime'] = 'warc/revisit'
else:
def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers.
@ -378,15 +374,16 @@ class DefaultRecordIter(object):
if record.rec_type == 'response':
entry.extract_status(record.status_headers)
else:
entry.status = '-'
entry['status'] = '-'
# digest
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
if entry.digest and entry.digest.startswith('sha1:'):
entry.digest = entry.digest[len('sha1:'):]
digest = record.rec_headers.get_header('WARC-Payload-Digest')
entry['digest'] = digest
if digest and digest.startswith('sha1:'):
entry['digest'] = digest[len('sha1:'):]
if not entry.digest:
entry.digest = '-'
elif not entry.get('digest'):
entry['digest'] = '-'
return entry
@ -407,12 +404,12 @@ class DefaultRecordIter(object):
url = url.replace('\x00', '%00')
entry = ArchiveIndexEntry()
entry.url = url
entry['url'] = url
# timestamp
entry.timestamp = record.rec_headers.get_header('archive-date')
if len(entry.timestamp) > 14:
entry.timestamp = entry.timestamp[:14]
entry['timestamp'] = record.rec_headers.get_header('archive-date')
if len(entry['timestamp']) > 14:
entry['timestamp'] = entry['timestamp'][:14]
if self.options.get('minimal'):
return entry
@ -424,7 +421,7 @@ class DefaultRecordIter(object):
entry.extract_mime(record.rec_headers.get_header('content-type'))
# digest
entry.digest = '-'
entry['digest'] = '-'
return entry

View File

@ -1,5 +1,7 @@
import os
import sys
import json
from argparse import ArgumentParser, RawTextHelpFormatter
from bisect import insort
@ -17,7 +19,7 @@ class BaseCDXWriter(object):
return self
def write(self, entry, filename):
if not entry.url or not entry.key:
if not entry.get('url') or not entry.get('key'):
return
if entry.record.rec_type == 'warcinfo':
@ -29,21 +31,48 @@ class BaseCDXWriter(object):
return False
#=================================================================
class CDXJ(object):
    """Format mixin that writes index entries as JSON-based cdx (cdxj) lines.

    Each line has the form ``<key> <timestamp> <json dict>`` followed by
    a newline. The JSON dict holds the filename plus every remaining
    entry field whose name does not start with an underscore.
    """

    def _write_header(self):
        """Write nothing: this format emits no header line."""
        pass

    def write_cdx_line(self, out, entry, filename):
        """Write a single cdxj line for ``entry`` to ``out``.

        :param out: object with a ``write`` method accepting text
        :param entry: dict-like index entry; must contain ``key`` and
            ``timestamp``
        :param filename: value stored under ``filename`` in the JSON dict
            (an entry field named ``filename`` would override it)
        """
        out.write(entry['key'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')

        outdict = {'filename': filename}

        # items() instead of iteritems(): identical result here, but
        # available on both Python 2 and Python 3 dicts.
        for name, value in entry.items():
            # key and timestamp are already written as plain-text prefix;
            # underscore-prefixed fields are never written to the index line
            if name in ('key', 'timestamp') or name.startswith('_'):
                continue

            outdict[name] = value

        out.write(json.dumps(outdict))
        out.write('\n')
#=================================================================
class CDX06(object):
def _write_header(self):
self.out.write(' CDX N b a S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(entry['key'])
out.write(' ')
out.write(entry.timestamp)
out.write(entry['timestamp'])
out.write(' ')
out.write(entry.url)
out.write(entry['url'])
out.write(' ')
out.write(entry.length)
out.write(entry['length'])
out.write(' ')
out.write(entry.offset)
out.write(entry['offset'])
out.write(' ')
out.write(filename)
out.write('\n')
@ -55,19 +84,19 @@ class CDX09(object):
self.out.write(' CDX N b a m s k r V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(entry['key'])
out.write(' ')
out.write(entry.timestamp)
out.write(entry['timestamp'])
out.write(' ')
out.write(entry.url)
out.write(entry['url'])
out.write(' ')
out.write(entry.mime)
out.write(entry['mime'])
out.write(' ')
out.write(entry.status)
out.write(entry['status'])
out.write(' ')
out.write(entry.digest)
out.write(entry['digest'])
out.write(' - ')
out.write(entry.offset)
out.write(entry['offset'])
out.write(' ')
out.write(filename)
out.write('\n')
@ -79,21 +108,21 @@ class CDX11(object):
self.out.write(' CDX N b a m s k r M S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry.key)
out.write(entry['key'])
out.write(' ')
out.write(entry.timestamp)
out.write(entry['timestamp'])
out.write(' ')
out.write(entry.url)
out.write(entry['url'])
out.write(' ')
out.write(entry.mime)
out.write(entry['mime'])
out.write(' ')
out.write(entry.status)
out.write(entry['status'])
out.write(' ')
out.write(entry.digest)
out.write(entry['digest'])
out.write(' - - ')
out.write(entry.length)
out.write(entry['length'])
out.write(' ')
out.write(entry.offset)
out.write(entry['offset'])
out.write(' ')
out.write(filename)
out.write('\n')
@ -171,10 +200,12 @@ def get_cdx_writer_cls(options):
else:
writer_cls = BaseCDXWriter
if options.get('cdx09'):
format_mixin = CDX09
elif options.get('minimal'):
if options.get('cdxj'):
format_mixin = CDXJ
elif options.get('cdx06') or options.get('minimal'):
format_mixin = CDX06
elif options.get('cdx09'):
format_mixin = CDX09
else:
format_mixin = CDX11
@ -329,7 +360,13 @@ if input is a directory"""
action='store_true',
help=cdx09_help)
group.add_argument('-m', '--minimal',
group.add_argument('-6', '--cdx06',
action='store_true')
group.add_argument('-j', '--cdxj',
action='store_true')
parser.add_argument('-m', '--minimal',
action='store_true',
help=minimal_help)
@ -345,6 +382,8 @@ if input is a directory"""
append_post=cmd.postappend,
recurse=cmd.recurse,
cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
cdxj=cmd.cdxj,
minimal=cmd.minimal)

View File

@ -132,7 +132,7 @@ class ArcWarcRecordLoader:
# don't parse the http record at all
if no_record_parse:
status_headers = StatusAndHeaders('', [])
status_headers = None#StatusAndHeaders('', [])
# if empty record (error or otherwise) set status to 204
elif length == 0:

View File

@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
#=================================================================
class DirectoryCollsLoader(object):
class DirectoryCollsLoader(object): #pragma: no cover
def __init__(self, config, static_routes):
self.config = config
self.static_routes = static_routes