mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: refactor ArchiveIndexEntry to be a dict instead of adding attributes. This allows for better tracking of indexed properties.
Add JSON-based CDX (cdxj) output, where all fields except key + timestamp are in a JSON dict. Supports both minimal and full JSON CDX; tracked via #76
This commit is contained in:
parent
bfe590996b
commit
6f9808f090
@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class CollectionsManager(object):
|
||||
class CollectionsManager(object): #pragma: no cover
|
||||
""" This utility is designed to
|
||||
simplify the creation and management of web archive collections
|
||||
|
||||
|
@ -189,36 +189,31 @@ class ArchiveIterator(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ArchiveIndexEntry(object):
|
||||
class ArchiveIndexEntry(dict):
|
||||
MIME_RE = re.compile('[; ]')
|
||||
|
||||
def __init__(self):
|
||||
self.url = None
|
||||
self.key = None
|
||||
self.digest = '-'
|
||||
|
||||
def extract_mime(self, mime, def_mime='unk'):
|
||||
""" Utility function to extract mimetype only
|
||||
from a full content type, removing charset settings
|
||||
"""
|
||||
self.mime = def_mime
|
||||
self['mime'] = def_mime
|
||||
if mime:
|
||||
self.mime = self.MIME_RE.split(mime, 1)[0]
|
||||
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
||||
|
||||
def extract_status(self, status_headers):
|
||||
""" Extract status code only from status line
|
||||
"""
|
||||
self.status = status_headers.get_statuscode()
|
||||
if not self.status:
|
||||
self.status = '-'
|
||||
if self.status == '204' and 'Error' in status_headers.statusline:
|
||||
self.status = '-'
|
||||
self['status'] = status_headers.get_statuscode()
|
||||
if not self['status']:
|
||||
self['status'] = '-'
|
||||
elif self['status'] == '204' and 'Error' in status_headers.statusline:
|
||||
self['status'] = '-'
|
||||
|
||||
def set_rec_info(self, offset, length, digest):
|
||||
self.offset = str(offset)
|
||||
self.length = str(length)
|
||||
self['offset'] = str(offset)
|
||||
self['length'] = str(length)
|
||||
if digest:
|
||||
self.digest = digest
|
||||
self['digest'] = digest
|
||||
|
||||
def merge_request_data(self, other, options):
|
||||
surt_ordered = options.get('surt_ordered', True)
|
||||
@ -231,14 +226,15 @@ class ArchiveIndexEntry(object):
|
||||
return False
|
||||
|
||||
# merge POST/PUT body query
|
||||
if hasattr(other, 'post_query'):
|
||||
url = append_post_query(self.url, other.post_query)
|
||||
self.key = canonicalize(url, surt_ordered)
|
||||
other.key = self.key
|
||||
post_query = other.get('_post_query')
|
||||
if post_query:
|
||||
url = append_post_query(self['url'], post_query)
|
||||
self['key'] = canonicalize(url, surt_ordered)
|
||||
other['key'] = self['key']
|
||||
|
||||
referer = other.record.status_headers.get_header('referer')
|
||||
if referer:
|
||||
self.referer = referer
|
||||
self['_referer'] = referer
|
||||
|
||||
return True
|
||||
|
||||
@ -263,7 +259,7 @@ class DefaultRecordIter(object):
|
||||
for record in arcv_iter.iter_records(block_size):
|
||||
entry = None
|
||||
|
||||
if not include_all and (record.status_headers.get_statuscode() == '-'):
|
||||
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
|
||||
continue
|
||||
|
||||
if record.format == 'warc':
|
||||
@ -283,27 +279,27 @@ class DefaultRecordIter(object):
|
||||
if not entry:
|
||||
continue
|
||||
|
||||
if entry.url and not entry.key:
|
||||
entry.key = canonicalize(entry.url, surt_ordered)
|
||||
if entry.get('url') and not entry.get('key'):
|
||||
entry['key'] = canonicalize(entry['url'], surt_ordered)
|
||||
|
||||
compute_digest = False
|
||||
|
||||
if (not minimal and
|
||||
entry.digest == '-' and
|
||||
entry.get('digest', '-') == '-' and
|
||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||
|
||||
compute_digest = True
|
||||
|
||||
elif record.rec_type == 'request' and append_post:
|
||||
elif not minimal and record.rec_type == 'request' and append_post:
|
||||
method = record.status_headers.protocol
|
||||
len_ = record.status_headers.get_header('Content-Length')
|
||||
|
||||
post_query = extract_post_query(method,
|
||||
entry.mime,
|
||||
entry.get('mime'),
|
||||
len_,
|
||||
record.stream)
|
||||
|
||||
entry.post_query = post_query
|
||||
entry['_post_query'] = post_query
|
||||
|
||||
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
||||
arcv_iter.read_to_end(record, compute_digest)
|
||||
@ -321,7 +317,7 @@ class DefaultRecordIter(object):
|
||||
continue
|
||||
|
||||
# check for url match
|
||||
if (entry.url != prev_entry.url):
|
||||
if (entry['url'] != prev_entry['url']):
|
||||
pass
|
||||
|
||||
# check for concurrency also
|
||||
@ -351,23 +347,23 @@ class DefaultRecordIter(object):
|
||||
entry = ArchiveIndexEntry()
|
||||
|
||||
if record.rec_type == 'warcinfo':
|
||||
entry.url = record.rec_headers.get_header('WARC-Filename')
|
||||
entry.key = entry.url
|
||||
entry.warcinfo = record.stream.read(record.length)
|
||||
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
||||
entry['key'] = entry['url']
|
||||
entry['_warcinfo'] = record.stream.read(record.length)
|
||||
return entry
|
||||
|
||||
entry.url = record.rec_headers.get_header('WARC-Target-Uri')
|
||||
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
|
||||
|
||||
# timestamp
|
||||
entry.timestamp = iso_date_to_timestamp(record.rec_headers.
|
||||
get_header('WARC-Date'))
|
||||
entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
|
||||
get_header('WARC-Date'))
|
||||
|
||||
if self.options.get('minimal'):
|
||||
return entry
|
||||
|
||||
# mime
|
||||
if record.rec_type == 'revisit':
|
||||
entry.mime = 'warc/revisit'
|
||||
entry['mime'] = 'warc/revisit'
|
||||
else:
|
||||
def_mime = '-' if record.rec_type == 'request' else 'unk'
|
||||
entry.extract_mime(record.status_headers.
|
||||
@ -378,15 +374,16 @@ class DefaultRecordIter(object):
|
||||
if record.rec_type == 'response':
|
||||
entry.extract_status(record.status_headers)
|
||||
else:
|
||||
entry.status = '-'
|
||||
entry['status'] = '-'
|
||||
|
||||
# digest
|
||||
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
if entry.digest and entry.digest.startswith('sha1:'):
|
||||
entry.digest = entry.digest[len('sha1:'):]
|
||||
digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
entry['digest'] = digest
|
||||
if digest and digest.startswith('sha1:'):
|
||||
entry['digest'] = digest[len('sha1:'):]
|
||||
|
||||
if not entry.digest:
|
||||
entry.digest = '-'
|
||||
elif not entry.get('digest'):
|
||||
entry['digest'] = '-'
|
||||
|
||||
return entry
|
||||
|
||||
@ -407,12 +404,12 @@ class DefaultRecordIter(object):
|
||||
url = url.replace('\x00', '%00')
|
||||
|
||||
entry = ArchiveIndexEntry()
|
||||
entry.url = url
|
||||
entry['url'] = url
|
||||
|
||||
# timestamp
|
||||
entry.timestamp = record.rec_headers.get_header('archive-date')
|
||||
if len(entry.timestamp) > 14:
|
||||
entry.timestamp = entry.timestamp[:14]
|
||||
entry['timestamp'] = record.rec_headers.get_header('archive-date')
|
||||
if len(entry['timestamp']) > 14:
|
||||
entry['timestamp'] = entry['timestamp'][:14]
|
||||
|
||||
if self.options.get('minimal'):
|
||||
return entry
|
||||
@ -424,7 +421,7 @@ class DefaultRecordIter(object):
|
||||
entry.extract_mime(record.rec_headers.get_header('content-type'))
|
||||
|
||||
# digest
|
||||
entry.digest = '-'
|
||||
entry['digest'] = '-'
|
||||
|
||||
return entry
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from bisect import insort
|
||||
|
||||
@ -17,7 +19,7 @@ class BaseCDXWriter(object):
|
||||
return self
|
||||
|
||||
def write(self, entry, filename):
|
||||
if not entry.url or not entry.key:
|
||||
if not entry.get('url') or not entry.get('key'):
|
||||
return
|
||||
|
||||
if entry.record.rec_type == 'warcinfo':
|
||||
@ -29,21 +31,48 @@ class BaseCDXWriter(object):
|
||||
return False
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXJ(object):
|
||||
def _write_header(self):
|
||||
pass
|
||||
|
||||
def write_cdx_line(self, out, entry, filename):
|
||||
out.write(entry['key'])
|
||||
out.write(' ')
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
|
||||
outdict = {}
|
||||
outdict['filename'] = filename
|
||||
|
||||
for n, v in entry.iteritems():
|
||||
if n in ('key', 'timestamp'):
|
||||
continue
|
||||
|
||||
if n.startswith('_'):
|
||||
continue
|
||||
|
||||
outdict[n] = v
|
||||
|
||||
out.write(json.dumps(outdict))
|
||||
out.write('\n')
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDX06(object):
|
||||
def _write_header(self):
|
||||
self.out.write(' CDX N b a S V g\n')
|
||||
|
||||
def write_cdx_line(self, out, entry, filename):
|
||||
out.write(entry.key)
|
||||
out.write(entry['key'])
|
||||
out.write(' ')
|
||||
out.write(entry.timestamp)
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
out.write(entry.url)
|
||||
out.write(entry['url'])
|
||||
out.write(' ')
|
||||
out.write(entry.length)
|
||||
out.write(entry['length'])
|
||||
out.write(' ')
|
||||
out.write(entry.offset)
|
||||
out.write(entry['offset'])
|
||||
out.write(' ')
|
||||
out.write(filename)
|
||||
out.write('\n')
|
||||
@ -55,19 +84,19 @@ class CDX09(object):
|
||||
self.out.write(' CDX N b a m s k r V g\n')
|
||||
|
||||
def write_cdx_line(self, out, entry, filename):
|
||||
out.write(entry.key)
|
||||
out.write(entry['key'])
|
||||
out.write(' ')
|
||||
out.write(entry.timestamp)
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
out.write(entry.url)
|
||||
out.write(entry['url'])
|
||||
out.write(' ')
|
||||
out.write(entry.mime)
|
||||
out.write(entry['mime'])
|
||||
out.write(' ')
|
||||
out.write(entry.status)
|
||||
out.write(entry['status'])
|
||||
out.write(' ')
|
||||
out.write(entry.digest)
|
||||
out.write(entry['digest'])
|
||||
out.write(' - ')
|
||||
out.write(entry.offset)
|
||||
out.write(entry['offset'])
|
||||
out.write(' ')
|
||||
out.write(filename)
|
||||
out.write('\n')
|
||||
@ -79,21 +108,21 @@ class CDX11(object):
|
||||
self.out.write(' CDX N b a m s k r M S V g\n')
|
||||
|
||||
def write_cdx_line(self, out, entry, filename):
|
||||
out.write(entry.key)
|
||||
out.write(entry['key'])
|
||||
out.write(' ')
|
||||
out.write(entry.timestamp)
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
out.write(entry.url)
|
||||
out.write(entry['url'])
|
||||
out.write(' ')
|
||||
out.write(entry.mime)
|
||||
out.write(entry['mime'])
|
||||
out.write(' ')
|
||||
out.write(entry.status)
|
||||
out.write(entry['status'])
|
||||
out.write(' ')
|
||||
out.write(entry.digest)
|
||||
out.write(entry['digest'])
|
||||
out.write(' - - ')
|
||||
out.write(entry.length)
|
||||
out.write(entry['length'])
|
||||
out.write(' ')
|
||||
out.write(entry.offset)
|
||||
out.write(entry['offset'])
|
||||
out.write(' ')
|
||||
out.write(filename)
|
||||
out.write('\n')
|
||||
@ -171,10 +200,12 @@ def get_cdx_writer_cls(options):
|
||||
else:
|
||||
writer_cls = BaseCDXWriter
|
||||
|
||||
if options.get('cdx09'):
|
||||
format_mixin = CDX09
|
||||
elif options.get('minimal'):
|
||||
if options.get('cdxj'):
|
||||
format_mixin = CDXJ
|
||||
elif options.get('cdx06') or options.get('minimal'):
|
||||
format_mixin = CDX06
|
||||
elif options.get('cdx09'):
|
||||
format_mixin = CDX09
|
||||
else:
|
||||
format_mixin = CDX11
|
||||
|
||||
@ -329,7 +360,13 @@ if input is a directory"""
|
||||
action='store_true',
|
||||
help=cdx09_help)
|
||||
|
||||
group.add_argument('-m', '--minimal',
|
||||
group.add_argument('-6', '--cdx06',
|
||||
action='store_true')
|
||||
|
||||
group.add_argument('-j', '--cdxj',
|
||||
action='store_true')
|
||||
|
||||
parser.add_argument('-m', '--minimal',
|
||||
action='store_true',
|
||||
help=minimal_help)
|
||||
|
||||
@ -345,6 +382,8 @@ if input is a directory"""
|
||||
append_post=cmd.postappend,
|
||||
recurse=cmd.recurse,
|
||||
cdx09=cmd.cdx09,
|
||||
cdx06=cmd.cdx06,
|
||||
cdxj=cmd.cdxj,
|
||||
minimal=cmd.minimal)
|
||||
|
||||
|
||||
|
@ -132,7 +132,7 @@ class ArcWarcRecordLoader:
|
||||
|
||||
# don't parse the http record at all
|
||||
if no_record_parse:
|
||||
status_headers = StatusAndHeaders('', [])
|
||||
status_headers = None#StatusAndHeaders('', [])
|
||||
|
||||
# if empty record (error or otherwise) set status to 204
|
||||
elif length == 0:
|
||||
|
@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class DirectoryCollsLoader(object):
|
||||
class DirectoryCollsLoader(object): #pragma: no cover
|
||||
def __init__(self, config, static_routes):
|
||||
self.config = config
|
||||
self.static_routes = static_routes
|
||||
|
Loading…
x
Reference in New Issue
Block a user