mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexing: refactor ArchiveIndexEntry to be a dict instead of setting attributes on the object. This allows for better tracking of indexed properties.
Add json-based cdx (cdxj) output, where all fields except key + timestamp are in a json dict. Support for both minimal and full json cdx, tracked via #76
This commit is contained in:
parent
bfe590996b
commit
6f9808f090
@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class CollectionsManager(object):
|
class CollectionsManager(object): #pragma: no cover
|
||||||
""" This utility is designed to
|
""" This utility is designed to
|
||||||
simplify the creation and management of web archive collections
|
simplify the creation and management of web archive collections
|
||||||
|
|
||||||
|
@ -189,36 +189,31 @@ class ArchiveIterator(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchiveIndexEntry(object):
|
class ArchiveIndexEntry(dict):
|
||||||
MIME_RE = re.compile('[; ]')
|
MIME_RE = re.compile('[; ]')
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.url = None
|
|
||||||
self.key = None
|
|
||||||
self.digest = '-'
|
|
||||||
|
|
||||||
def extract_mime(self, mime, def_mime='unk'):
|
def extract_mime(self, mime, def_mime='unk'):
|
||||||
""" Utility function to extract mimetype only
|
""" Utility function to extract mimetype only
|
||||||
from a full content type, removing charset settings
|
from a full content type, removing charset settings
|
||||||
"""
|
"""
|
||||||
self.mime = def_mime
|
self['mime'] = def_mime
|
||||||
if mime:
|
if mime:
|
||||||
self.mime = self.MIME_RE.split(mime, 1)[0]
|
self['mime'] = self.MIME_RE.split(mime, 1)[0]
|
||||||
|
|
||||||
def extract_status(self, status_headers):
|
def extract_status(self, status_headers):
|
||||||
""" Extract status code only from status line
|
""" Extract status code only from status line
|
||||||
"""
|
"""
|
||||||
self.status = status_headers.get_statuscode()
|
self['status'] = status_headers.get_statuscode()
|
||||||
if not self.status:
|
if not self['status']:
|
||||||
self.status = '-'
|
self['status'] = '-'
|
||||||
if self.status == '204' and 'Error' in status_headers.statusline:
|
elif self['status'] == '204' and 'Error' in status_headers.statusline:
|
||||||
self.status = '-'
|
self['status'] = '-'
|
||||||
|
|
||||||
def set_rec_info(self, offset, length, digest):
|
def set_rec_info(self, offset, length, digest):
|
||||||
self.offset = str(offset)
|
self['offset'] = str(offset)
|
||||||
self.length = str(length)
|
self['length'] = str(length)
|
||||||
if digest:
|
if digest:
|
||||||
self.digest = digest
|
self['digest'] = digest
|
||||||
|
|
||||||
def merge_request_data(self, other, options):
|
def merge_request_data(self, other, options):
|
||||||
surt_ordered = options.get('surt_ordered', True)
|
surt_ordered = options.get('surt_ordered', True)
|
||||||
@ -231,14 +226,15 @@ class ArchiveIndexEntry(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
# merge POST/PUT body query
|
# merge POST/PUT body query
|
||||||
if hasattr(other, 'post_query'):
|
post_query = other.get('_post_query')
|
||||||
url = append_post_query(self.url, other.post_query)
|
if post_query:
|
||||||
self.key = canonicalize(url, surt_ordered)
|
url = append_post_query(self['url'], post_query)
|
||||||
other.key = self.key
|
self['key'] = canonicalize(url, surt_ordered)
|
||||||
|
other['key'] = self['key']
|
||||||
|
|
||||||
referer = other.record.status_headers.get_header('referer')
|
referer = other.record.status_headers.get_header('referer')
|
||||||
if referer:
|
if referer:
|
||||||
self.referer = referer
|
self['_referer'] = referer
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -263,7 +259,7 @@ class DefaultRecordIter(object):
|
|||||||
for record in arcv_iter.iter_records(block_size):
|
for record in arcv_iter.iter_records(block_size):
|
||||||
entry = None
|
entry = None
|
||||||
|
|
||||||
if not include_all and (record.status_headers.get_statuscode() == '-'):
|
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if record.format == 'warc':
|
if record.format == 'warc':
|
||||||
@ -283,27 +279,27 @@ class DefaultRecordIter(object):
|
|||||||
if not entry:
|
if not entry:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if entry.url and not entry.key:
|
if entry.get('url') and not entry.get('key'):
|
||||||
entry.key = canonicalize(entry.url, surt_ordered)
|
entry['key'] = canonicalize(entry['url'], surt_ordered)
|
||||||
|
|
||||||
compute_digest = False
|
compute_digest = False
|
||||||
|
|
||||||
if (not minimal and
|
if (not minimal and
|
||||||
entry.digest == '-' and
|
entry.get('digest', '-') == '-' and
|
||||||
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
record.rec_type not in ('revisit', 'request', 'warcinfo')):
|
||||||
|
|
||||||
compute_digest = True
|
compute_digest = True
|
||||||
|
|
||||||
elif record.rec_type == 'request' and append_post:
|
elif not minimal and record.rec_type == 'request' and append_post:
|
||||||
method = record.status_headers.protocol
|
method = record.status_headers.protocol
|
||||||
len_ = record.status_headers.get_header('Content-Length')
|
len_ = record.status_headers.get_header('Content-Length')
|
||||||
|
|
||||||
post_query = extract_post_query(method,
|
post_query = extract_post_query(method,
|
||||||
entry.mime,
|
entry.get('mime'),
|
||||||
len_,
|
len_,
|
||||||
record.stream)
|
record.stream)
|
||||||
|
|
||||||
entry.post_query = post_query
|
entry['_post_query'] = post_query
|
||||||
|
|
||||||
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
|
||||||
arcv_iter.read_to_end(record, compute_digest)
|
arcv_iter.read_to_end(record, compute_digest)
|
||||||
@ -321,7 +317,7 @@ class DefaultRecordIter(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# check for url match
|
# check for url match
|
||||||
if (entry.url != prev_entry.url):
|
if (entry['url'] != prev_entry['url']):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# check for concurrency also
|
# check for concurrency also
|
||||||
@ -351,23 +347,23 @@ class DefaultRecordIter(object):
|
|||||||
entry = ArchiveIndexEntry()
|
entry = ArchiveIndexEntry()
|
||||||
|
|
||||||
if record.rec_type == 'warcinfo':
|
if record.rec_type == 'warcinfo':
|
||||||
entry.url = record.rec_headers.get_header('WARC-Filename')
|
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
||||||
entry.key = entry.url
|
entry['key'] = entry['url']
|
||||||
entry.warcinfo = record.stream.read(record.length)
|
entry['_warcinfo'] = record.stream.read(record.length)
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
entry.url = record.rec_headers.get_header('WARC-Target-Uri')
|
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
|
||||||
|
|
||||||
# timestamp
|
# timestamp
|
||||||
entry.timestamp = iso_date_to_timestamp(record.rec_headers.
|
entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
|
||||||
get_header('WARC-Date'))
|
get_header('WARC-Date'))
|
||||||
|
|
||||||
if self.options.get('minimal'):
|
if self.options.get('minimal'):
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
# mime
|
# mime
|
||||||
if record.rec_type == 'revisit':
|
if record.rec_type == 'revisit':
|
||||||
entry.mime = 'warc/revisit'
|
entry['mime'] = 'warc/revisit'
|
||||||
else:
|
else:
|
||||||
def_mime = '-' if record.rec_type == 'request' else 'unk'
|
def_mime = '-' if record.rec_type == 'request' else 'unk'
|
||||||
entry.extract_mime(record.status_headers.
|
entry.extract_mime(record.status_headers.
|
||||||
@ -378,15 +374,16 @@ class DefaultRecordIter(object):
|
|||||||
if record.rec_type == 'response':
|
if record.rec_type == 'response':
|
||||||
entry.extract_status(record.status_headers)
|
entry.extract_status(record.status_headers)
|
||||||
else:
|
else:
|
||||||
entry.status = '-'
|
entry['status'] = '-'
|
||||||
|
|
||||||
# digest
|
# digest
|
||||||
entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||||
if entry.digest and entry.digest.startswith('sha1:'):
|
entry['digest'] = digest
|
||||||
entry.digest = entry.digest[len('sha1:'):]
|
if digest and digest.startswith('sha1:'):
|
||||||
|
entry['digest'] = digest[len('sha1:'):]
|
||||||
|
|
||||||
if not entry.digest:
|
elif not entry.get('digest'):
|
||||||
entry.digest = '-'
|
entry['digest'] = '-'
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
@ -407,12 +404,12 @@ class DefaultRecordIter(object):
|
|||||||
url = url.replace('\x00', '%00')
|
url = url.replace('\x00', '%00')
|
||||||
|
|
||||||
entry = ArchiveIndexEntry()
|
entry = ArchiveIndexEntry()
|
||||||
entry.url = url
|
entry['url'] = url
|
||||||
|
|
||||||
# timestamp
|
# timestamp
|
||||||
entry.timestamp = record.rec_headers.get_header('archive-date')
|
entry['timestamp'] = record.rec_headers.get_header('archive-date')
|
||||||
if len(entry.timestamp) > 14:
|
if len(entry['timestamp']) > 14:
|
||||||
entry.timestamp = entry.timestamp[:14]
|
entry['timestamp'] = entry['timestamp'][:14]
|
||||||
|
|
||||||
if self.options.get('minimal'):
|
if self.options.get('minimal'):
|
||||||
return entry
|
return entry
|
||||||
@ -424,7 +421,7 @@ class DefaultRecordIter(object):
|
|||||||
entry.extract_mime(record.rec_headers.get_header('content-type'))
|
entry.extract_mime(record.rec_headers.get_header('content-type'))
|
||||||
|
|
||||||
# digest
|
# digest
|
||||||
entry.digest = '-'
|
entry['digest'] = '-'
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
from bisect import insort
|
from bisect import insort
|
||||||
|
|
||||||
@ -17,7 +19,7 @@ class BaseCDXWriter(object):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def write(self, entry, filename):
|
def write(self, entry, filename):
|
||||||
if not entry.url or not entry.key:
|
if not entry.get('url') or not entry.get('key'):
|
||||||
return
|
return
|
||||||
|
|
||||||
if entry.record.rec_type == 'warcinfo':
|
if entry.record.rec_type == 'warcinfo':
|
||||||
@ -29,21 +31,48 @@ class BaseCDXWriter(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXJ(object):
    """Format mixin that writes index entries as JSON-based CDX (cdxj) lines.

    Each line has the form ``<key> <timestamp> <json dict>``, terminated by
    a newline. It is selected as the format mixin when the ``cdxj`` option
    is set.
    """

    def _write_header(self):
        """Write nothing: the cdxj format has no header line."""
        pass

    def write_cdx_line(self, out, entry, filename):
        """Write a single cdxj line for ``entry`` to ``out``.

        :param out: writable text stream exposing ``write()``
        :param entry: dict-like index entry; must contain ``'key'`` and
            ``'timestamp'``
        :param filename: archive file name, stored under ``'filename'``
            in the JSON dict
        """
        out.write(entry['key'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')

        outdict = {}
        outdict['filename'] = filename

        # items() instead of iteritems() so this works on both Python 2
        # and Python 3 dicts.
        for n, v in entry.items():
            # key and timestamp are already written as the line prefix;
            # names starting with '_' are internal-only and never serialized.
            if n in ('key', 'timestamp') or n.startswith('_'):
                continue

            outdict[n] = v

        out.write(json.dumps(outdict))
        out.write('\n')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDX06(object):
|
class CDX06(object):
|
||||||
def _write_header(self):
|
def _write_header(self):
|
||||||
self.out.write(' CDX N b a S V g\n')
|
self.out.write(' CDX N b a S V g\n')
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry.key)
|
out.write(entry['key'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.url)
|
out.write(entry['url'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.length)
|
out.write(entry['length'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.offset)
|
out.write(entry['offset'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(filename)
|
out.write(filename)
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
@ -55,19 +84,19 @@ class CDX09(object):
|
|||||||
self.out.write(' CDX N b a m s k r V g\n')
|
self.out.write(' CDX N b a m s k r V g\n')
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry.key)
|
out.write(entry['key'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.url)
|
out.write(entry['url'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.mime)
|
out.write(entry['mime'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.status)
|
out.write(entry['status'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.digest)
|
out.write(entry['digest'])
|
||||||
out.write(' - ')
|
out.write(' - ')
|
||||||
out.write(entry.offset)
|
out.write(entry['offset'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(filename)
|
out.write(filename)
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
@ -79,21 +108,21 @@ class CDX11(object):
|
|||||||
self.out.write(' CDX N b a m s k r M S V g\n')
|
self.out.write(' CDX N b a m s k r M S V g\n')
|
||||||
|
|
||||||
def write_cdx_line(self, out, entry, filename):
|
def write_cdx_line(self, out, entry, filename):
|
||||||
out.write(entry.key)
|
out.write(entry['key'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.timestamp)
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.url)
|
out.write(entry['url'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.mime)
|
out.write(entry['mime'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.status)
|
out.write(entry['status'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.digest)
|
out.write(entry['digest'])
|
||||||
out.write(' - - ')
|
out.write(' - - ')
|
||||||
out.write(entry.length)
|
out.write(entry['length'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry.offset)
|
out.write(entry['offset'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(filename)
|
out.write(filename)
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
@ -171,10 +200,12 @@ def get_cdx_writer_cls(options):
|
|||||||
else:
|
else:
|
||||||
writer_cls = BaseCDXWriter
|
writer_cls = BaseCDXWriter
|
||||||
|
|
||||||
if options.get('cdx09'):
|
if options.get('cdxj'):
|
||||||
format_mixin = CDX09
|
format_mixin = CDXJ
|
||||||
elif options.get('minimal'):
|
elif options.get('cdx06') or options.get('minimal'):
|
||||||
format_mixin = CDX06
|
format_mixin = CDX06
|
||||||
|
elif options.get('cdx09'):
|
||||||
|
format_mixin = CDX09
|
||||||
else:
|
else:
|
||||||
format_mixin = CDX11
|
format_mixin = CDX11
|
||||||
|
|
||||||
@ -329,7 +360,13 @@ if input is a directory"""
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help=cdx09_help)
|
help=cdx09_help)
|
||||||
|
|
||||||
group.add_argument('-m', '--minimal',
|
group.add_argument('-6', '--cdx06',
|
||||||
|
action='store_true')
|
||||||
|
|
||||||
|
group.add_argument('-j', '--cdxj',
|
||||||
|
action='store_true')
|
||||||
|
|
||||||
|
parser.add_argument('-m', '--minimal',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help=minimal_help)
|
help=minimal_help)
|
||||||
|
|
||||||
@ -345,6 +382,8 @@ if input is a directory"""
|
|||||||
append_post=cmd.postappend,
|
append_post=cmd.postappend,
|
||||||
recurse=cmd.recurse,
|
recurse=cmd.recurse,
|
||||||
cdx09=cmd.cdx09,
|
cdx09=cmd.cdx09,
|
||||||
|
cdx06=cmd.cdx06,
|
||||||
|
cdxj=cmd.cdxj,
|
||||||
minimal=cmd.minimal)
|
minimal=cmd.minimal)
|
||||||
|
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@ class ArcWarcRecordLoader:
|
|||||||
|
|
||||||
# don't parse the http record at all
|
# don't parse the http record at all
|
||||||
if no_record_parse:
|
if no_record_parse:
|
||||||
status_headers = StatusAndHeaders('', [])
|
status_headers = None#StatusAndHeaders('', [])
|
||||||
|
|
||||||
# if empty record (error or otherwise) set status to 204
|
# if empty record (error or otherwise) set status to 204
|
||||||
elif length == 0:
|
elif length == 0:
|
||||||
|
@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class DirectoryCollsLoader(object):
|
class DirectoryCollsLoader(object): #pragma: no cover
|
||||||
def __init__(self, config, static_routes):
|
def __init__(self, config, static_routes):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.static_routes = static_routes
|
self.static_routes = static_routes
|
||||||
|
Loading…
x
Reference in New Issue
Block a user