mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexing: cdx json support (#76): use OrderedDict when indexing json to ensure consistent ordering
skip empty or '-' fields add tests for cdx json
This commit is contained in:
parent
6f9808f090
commit
3f084625b0
@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class CollectionsManager(object): #pragma: no cover
|
class CollectionsManager(object):
|
||||||
""" This utility is designed to
|
""" This utility is designed to
|
||||||
simplify the creation and management of web archive collections
|
simplify the creation and management of web archive collections
|
||||||
|
|
||||||
|
@ -10,6 +10,11 @@ import base64
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
try: # pragma: no cover
|
||||||
|
from collections import OrderedDict
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
from ordereddict import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchiveIterator(object):
|
class ArchiveIterator(object):
|
||||||
@ -189,7 +194,7 @@ class ArchiveIterator(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchiveIndexEntry(dict):
|
class ArchiveIndexEntryMixin(object):
|
||||||
MIME_RE = re.compile('[; ]')
|
MIME_RE = re.compile('[; ]')
|
||||||
|
|
||||||
def extract_mime(self, mime, def_mime='unk'):
|
def extract_mime(self, mime, def_mime='unk'):
|
||||||
@ -210,8 +215,8 @@ class ArchiveIndexEntry(dict):
|
|||||||
self['status'] = '-'
|
self['status'] = '-'
|
||||||
|
|
||||||
def set_rec_info(self, offset, length, digest):
|
def set_rec_info(self, offset, length, digest):
|
||||||
self['offset'] = str(offset)
|
|
||||||
self['length'] = str(length)
|
self['length'] = str(length)
|
||||||
|
self['offset'] = str(offset)
|
||||||
if digest:
|
if digest:
|
||||||
self['digest'] = digest
|
self['digest'] = digest
|
||||||
|
|
||||||
@ -244,6 +249,12 @@ class DefaultRecordIter(object):
|
|||||||
def __init__(self, **options):
|
def __init__(self, **options):
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
|
def _create_index_entry(self):
|
||||||
|
if self.options.get('cdxj'):
|
||||||
|
return OrderedArchiveIndexEntry()
|
||||||
|
else:
|
||||||
|
return ArchiveIndexEntry()
|
||||||
|
|
||||||
def create_record_iter(self, arcv_iter):
|
def create_record_iter(self, arcv_iter):
|
||||||
append_post = self.options.get('append_post')
|
append_post = self.options.get('append_post')
|
||||||
include_all = self.options.get('include_all')
|
include_all = self.options.get('include_all')
|
||||||
@ -344,7 +355,7 @@ class DefaultRecordIter(object):
|
|||||||
""" Parse warc record
|
""" Parse warc record
|
||||||
"""
|
"""
|
||||||
|
|
||||||
entry = ArchiveIndexEntry()
|
entry = self._create_index_entry()
|
||||||
|
|
||||||
if record.rec_type == 'warcinfo':
|
if record.rec_type == 'warcinfo':
|
||||||
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
entry['url'] = record.rec_headers.get_header('WARC-Filename')
|
||||||
@ -403,7 +414,7 @@ class DefaultRecordIter(object):
|
|||||||
# replace nulls
|
# replace nulls
|
||||||
url = url.replace('\x00', '%00')
|
url = url.replace('\x00', '%00')
|
||||||
|
|
||||||
entry = ArchiveIndexEntry()
|
entry = self._create_index_entry()
|
||||||
entry['url'] = url
|
entry['url'] = url
|
||||||
|
|
||||||
# timestamp
|
# timestamp
|
||||||
@ -414,12 +425,12 @@ class DefaultRecordIter(object):
|
|||||||
if self.options.get('minimal'):
|
if self.options.get('minimal'):
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
# status
|
|
||||||
entry.extract_status(record.status_headers)
|
|
||||||
|
|
||||||
# mime
|
# mime
|
||||||
entry.extract_mime(record.rec_headers.get_header('content-type'))
|
entry.extract_mime(record.rec_headers.get_header('content-type'))
|
||||||
|
|
||||||
|
# status
|
||||||
|
entry.extract_status(record.status_headers)
|
||||||
|
|
||||||
# digest
|
# digest
|
||||||
entry['digest'] = '-'
|
entry['digest'] = '-'
|
||||||
|
|
||||||
@ -439,3 +450,9 @@ class DefaultRecordIter(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
|
class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict):
|
||||||
|
pass
|
||||||
|
@ -1,6 +1,13 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import json
|
|
||||||
|
from json import dumps as json_encode
|
||||||
|
|
||||||
|
try: # pragma: no cover
|
||||||
|
from collections import OrderedDict
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
from ordereddict import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
from bisect import insort
|
from bisect import insort
|
||||||
@ -42,8 +49,7 @@ class CDXJ(object):
|
|||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
|
|
||||||
outdict = {}
|
outdict = OrderedDict()
|
||||||
outdict['filename'] = filename
|
|
||||||
|
|
||||||
for n, v in entry.iteritems():
|
for n, v in entry.iteritems():
|
||||||
if n in ('key', 'timestamp'):
|
if n in ('key', 'timestamp'):
|
||||||
@ -52,9 +58,13 @@ class CDXJ(object):
|
|||||||
if n.startswith('_'):
|
if n.startswith('_'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if not v or v == '-':
|
||||||
|
continue
|
||||||
|
|
||||||
outdict[n] = v
|
outdict[n] = v
|
||||||
|
|
||||||
out.write(json.dumps(outdict))
|
outdict['filename'] = filename
|
||||||
|
out.write(json_encode(outdict))
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,6 +15,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 exa
|
|||||||
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
|
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
|
||||||
|
|
||||||
|
# warc.gz -- minimal CDXJ
|
||||||
|
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
|
||||||
|
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||||
|
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||||
|
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
||||||
|
|
||||||
# warc.gz -- parse all
|
# warc.gz -- parse all
|
||||||
>>> print_cdx_index('example.warc.gz', include_all=True)
|
>>> print_cdx_index('example.warc.gz', include_all=True)
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
@ -24,6 +30,14 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
|
|||||||
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
|
com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
|
|
||||||
|
# warc.gz -- parse all -- CDXJ
|
||||||
|
>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True)
|
||||||
|
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||||
|
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"}
|
||||||
|
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||||
|
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"}
|
||||||
|
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
||||||
|
|
||||||
# warc
|
# warc
|
||||||
>>> print_cdx_index('example.warc')
|
>>> print_cdx_index('example.warc')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
@ -45,6 +59,10 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex
|
|||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
||||||
|
|
||||||
|
# arc.gz -- json
|
||||||
|
>>> print_cdx_index('example.arc.gz', cdxj=True)
|
||||||
|
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
|
||||||
|
|
||||||
# arc
|
# arc
|
||||||
>>> print_cdx_index('example.arc')
|
>>> print_cdx_index('example.arc')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
|
@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class DirectoryCollsLoader(object): #pragma: no cover
|
class DirectoryCollsLoader(object):
|
||||||
def __init__(self, config, static_routes):
|
def __init__(self, config, static_routes):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.static_routes = static_routes
|
self.static_routes = static_routes
|
||||||
|
Loading…
x
Reference in New Issue
Block a user