From 3f084625b07f40525e04a8e82af7d40c6a8ec151 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Mar 2015 21:11:35 -0700 Subject: [PATCH] indexing: cdx json support (#76): use OrderedDict when indexing json to ensure consistent ordering skip empty or '-' fields add tests for cdx json --- pywb/manager/manager.py | 2 +- pywb/warc/archiveiterator.py | 31 ++++++++++++++++++++++++------- pywb/warc/cdxindexer.py | 18 ++++++++++++++---- pywb/warc/test/test_indexing.py | 18 ++++++++++++++++++ pywb/webapp/pywb_init.py | 2 +- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 595a71f9..9bd57a6b 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -23,7 +23,7 @@ def get_input(msg): #pragma: no cover #============================================================================= -class CollectionsManager(object): #pragma: no cover +class CollectionsManager(object): """ This utility is designed to simplify the creation and management of web archive collections diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index dfdc1835..7a88ec44 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -10,6 +10,11 @@ import base64 import re +try: # pragma: no cover + from collections import OrderedDict +except ImportError: # pragma: no cover + from ordereddict import OrderedDict + #================================================================= class ArchiveIterator(object): @@ -189,7 +194,7 @@ class ArchiveIterator(object): #================================================================= -class ArchiveIndexEntry(dict): +class ArchiveIndexEntryMixin(object): MIME_RE = re.compile('[; ]') def extract_mime(self, mime, def_mime='unk'): @@ -210,8 +215,8 @@ class ArchiveIndexEntry(dict): self['status'] = '-' def set_rec_info(self, offset, length, digest): - self['offset'] = str(offset) self['length'] = str(length) + self['offset'] = str(offset) if digest: self['digest'] = digest @@ -244,6 +249,12 @@ class DefaultRecordIter(object): def __init__(self, **options): self.options = options + def _create_index_entry(self): + if self.options.get('cdxj'): + return OrderedArchiveIndexEntry() + else: + return ArchiveIndexEntry() + def create_record_iter(self, arcv_iter): append_post = self.options.get('append_post') include_all = self.options.get('include_all') @@ -344,7 +355,7 @@ class DefaultRecordIter(object): """ Parse warc record """ - entry = ArchiveIndexEntry() + entry = self._create_index_entry() if record.rec_type == 'warcinfo': entry['url'] = record.rec_headers.get_header('WARC-Filename') @@ -403,7 +414,7 @@ class DefaultRecordIter(object): # replace nulls url = url.replace('\x00', '%00') - entry = ArchiveIndexEntry() + entry = self._create_index_entry() entry['url'] = url # timestamp @@ -414,12 +425,12 @@ class DefaultRecordIter(object): if self.options.get('minimal'): return entry - # status - entry.extract_status(record.status_headers) - # mime entry.extract_mime(record.rec_headers.get_header('content-type')) + # status + entry.extract_status(record.status_headers) + # digest entry['digest'] = '-' @@ -439,3 +450,9 @@ class DefaultRecordIter(object): continue yield entry + +class ArchiveIndexEntry(ArchiveIndexEntryMixin, dict): + pass + +class OrderedArchiveIndexEntry(ArchiveIndexEntryMixin, OrderedDict): + pass diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index c25c928c..7bbe4942 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -1,6 +1,13 @@ import os import sys -import json + +from json import dumps as json_encode + +try: # pragma: no cover + from collections import OrderedDict +except ImportError: # pragma: no cover + from ordereddict import OrderedDict + from argparse import ArgumentParser, RawTextHelpFormatter from bisect import insort @@ -42,8 +49,7 @@ class CDXJ(object): out.write(entry['timestamp']) out.write(' ') - outdict = {} - outdict['filename'] = filename + outdict = OrderedDict() for n, v in entry.iteritems(): if n in ('key', 'timestamp'): @@ -52,9 +58,13 @@ class CDXJ(object): if n.startswith('_'): continue + if not v or v == '-': + continue + outdict[n] = v - out.write(json.dumps(outdict)) + outdict['filename'] = filename + out.write(json_encode(outdict)) out.write('\n') diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index c01d3ed0..6d1d33f7 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -15,6 +15,12 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 exa com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz +# warc.gz -- minimal CDXJ +>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True) +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"} + # warc.gz -- parse all >>> print_cdx_index('example.warc.gz', include_all=True) CDX N b a m s k r M S V g @@ -24,6 +30,14 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/?example=1 20140103030341 http://example.com?example=1 - - - - - 490 2417 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +# warc.gz -- parse all -- CDXJ +>>> print_cdx_index('example.warc.gz', include_all=True, cdxj=True) +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "488", "offset": "1376", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "490", "offset": "2417", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} + # warc >>> print_cdx_index('example.warc') CDX N b a m s k r M S V g @@ -45,6 +59,10 @@ org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example tex CDX N b a m s k r M S V g com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz +# arc.gz -- json +>>> print_cdx_index('example.arc.gz', cdxj=True) +com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} + # arc >>> print_cdx_index('example.arc') CDX N b a m s k r M S V g diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index cd18cc25..cad3d1a6 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -126,7 +126,7 @@ def create_cdx_server_app(passed_config): #================================================================= -class DirectoryCollsLoader(object): #pragma: no cover +class DirectoryCollsLoader(object): def __init__(self, config, static_routes): self.config = config self.static_routes = static_routes