1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

indexing: CDX JSON support (#76): use OrderedDict when indexing JSON to ensure consistent field ordering

skip fields that are empty or '-'
add tests for CDX JSON
This commit is contained in:
Ilya Kreymer 2015-03-17 21:11:35 -07:00
parent 6f9808f090
commit 3f084625b0
5 changed files with 58 additions and 13 deletions

View File

@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
#============================================================================= #=============================================================================
class CollectionsManager(object): #pragma: no cover class CollectionsManager(object):
""" This utility is designed to """ This utility is designed to
simplify the creation and management of web archive collections simplify the creation and management of web archive collections

View File

@ -10,6 +10,11 @@ import base64
import re import re
try: # pragma: no cover
from collections import OrderedDict
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
#================================================================= #=================================================================
class ArchiveIterator(object): class ArchiveIterator(object):
@ -189,7 +194,7 @@ class ArchiveIterator(object):
#================================================================= #=================================================================
class ArchiveIndexEntry(dict): class ArchiveIndexEntryMixin(object):
MIME_RE = re.compile('[; ]') MIME_RE = re.compile('[; ]')
def extract_mime(self, mime, def_mime='unk'): def extract_mime(self, mime, def_mime='unk'):
@ -210,8 +215,8 @@ class ArchiveIndexEntry(dict):
self['status'] = '-' self['status'] = '-'
def set_rec_info(self, offset, length, digest): def set_rec_info(self, offset, length, digest):
self['offset'] = str(offset)
self['length'] = str(length) self['length'] = str(length)
self['offset'] = str(offset)
if digest: if digest:
self['digest'] = digest self['digest'] = digest
@ -244,6 +249,12 @@ class DefaultRecordIter(object):
def __init__(self, **options): def __init__(self, **options):
self.options = options self.options = options
def _create_index_entry(self):
    """ Return a new, empty index entry for a record.

    When the 'cdxj' option is set, the entry is an
    OrderedArchiveIndexEntry; otherwise it is an ArchiveIndexEntry.
    """
    use_ordered = self.options.get('cdxj')
    entry_cls = OrderedArchiveIndexEntry if use_ordered else ArchiveIndexEntry
    return entry_cls()
def create_record_iter(self, arcv_iter): def create_record_iter(self, arcv_iter):
append_post = self.options.get('append_post') append_post = self.options.get('append_post')
include_all = self.options.get('include_all') include_all = self.options.get('include_all')
@ -344,7 +355,7 @@ class DefaultRecordIter(object):
""" Parse warc record """ Parse warc record
""" """
entry = ArchiveIndexEntry() entry = self._create_index_entry()
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
@ -403,7 +414,7 @@ class DefaultRecordIter(object):
# replace nulls # replace nulls
url = url.replace('\x00', '%00') url = url.replace('\x00', '%00')
entry = ArchiveIndexEntry() entry = self._create_index_entry()
entry['url'] = url entry['url'] = url
# timestamp # timestamp
@ -414,12 +425,12 @@ class DefaultRecordIter(object):
if self.options.get('minimal'): if self.options.get('minimal'):
return entry return entry
# status
entry.extract_status(record.status_headers)
# mime # mime
entry.extract_mime(record.rec_headers.get_header('content-type')) entry.extract_mime(record.rec_headers.get_header('content-type'))
# status
entry.extract_status(record.status_headers)
# digest # digest
entry['digest'] = '-' entry['digest'] = '-'
@ -439,3 +450,9 @@ class DefaultRecordIter(object):
continue continue
yield entry yield entry
class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict):
    """ Index entry combining the ArchiveIndexEntryMixin helpers
    with a plain dict for field storage.
    """
class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict):
    """ Index entry combining the ArchiveIndexEntryMixin helpers
    with an OrderedDict, so fields keep the order they were set in.
    """

View File

@ -1,6 +1,13 @@
import os import os
import sys import sys
import json
from json import dumps as json_encode
try: # pragma: no cover
from collections import OrderedDict
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from bisect import insort from bisect import insort
@ -42,8 +49,7 @@ class CDXJ(object):
out.write(entry['timestamp']) out.write(entry['timestamp'])
out.write(' ') out.write(' ')
outdict = {} outdict = OrderedDict()
outdict['filename'] = filename
for n, v in entry.iteritems(): for n, v in entry.iteritems():
if n in ('key', 'timestamp'): if n in ('key', 'timestamp'):
@ -52,9 +58,13 @@ class CDXJ(object):
if n.startswith('_'): if n.startswith('_'):
continue continue
if not v or v == '-':
continue
outdict[n] = v outdict[n] = v
out.write(json.dumps(outdict)) outdict['filename'] = filename
out.write(json_encode(outdict))
out.write('\n') out.write('\n')

View File

@ -15,6 +15,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 exa
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
# warc.gz -- minimal CDXJ
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc.gz -- parse all # warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True) >>> print_cdx_index('example.warc.gz', include_all=True)
CDX N b a m s k r M S V g CDX N b a m s k r M S V g
@ -24,6 +30,14 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- parse all -- CDXJ
>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc # warc
>>> print_cdx_index('example.warc') >>> print_cdx_index('example.warc')
CDX N b a m s k r M S V g CDX N b a m s k r M S V g
@ -45,6 +59,10 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
CDX N b a m s k r M S V g CDX N b a m s k r M S V g
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
# arc.gz -- json
>>> print_cdx_index('example.arc.gz', cdxj=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc # arc
>>> print_cdx_index('example.arc') >>> print_cdx_index('example.arc')
CDX N b a m s k r M S V g CDX N b a m s k r M S V g

View File

@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
#================================================================= #=================================================================
class DirectoryCollsLoader(object): #pragma: no cover class DirectoryCollsLoader(object):
def __init__(self, config, static_routes): def __init__(self, config, static_routes):
self.config = config self.config = config
self.static_routes = static_routes self.static_routes = static_routes