1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdxj: support loading cdxj (#76)

cdx obj: allow alt field names to be used (eg. mime, mimetype, m)
(status/statuscode/s) in querying and reading cdx
cdx minimal: (#75) now implies cdxj to avoid more formats
minimal includes digest always and mime when warc/revisit
tests for cdxj loading
indexing optimization: reuse same entry obj for records of same type
This commit is contained in:
Ilya Kreymer 2015-03-19 11:20:40 -07:00
parent 73f24f5a2b
commit fe1c32c8f7
12 changed files with 174 additions and 95 deletions

View File

@ -107,8 +107,8 @@ enable_http_proxy: true
# List of route names: # List of route names:
# <route>: <package or file path> # <route>: <package or file path>
# default route static/default for pywb defaults # default route static/default for pywb defaults
static_routes: #static_routes:
static/default: pywb/static/ # static/default: pywb/static/
# enable cdx server api for querying cdx directly (experimental) # enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true enable_cdx_api: true

View File

@ -10,6 +10,26 @@ from urlparse import parse_qs
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from json import loads as json_decode
#=================================================================
# Canonical CDX field names. These are the column names listed in the
# CDXObject.CDX_FORMATS layouts and the keys used to read and write
# fields on a CDX object.
URLKEY = 'urlkey'
TIMESTAMP = 'timestamp'
ORIGINAL = 'original'
MIMETYPE = 'mimetype'
STATUSCODE = 'statuscode'
DIGEST = 'digest'
REDIRECT = 'redirect'
ROBOTFLAGS = 'robotflags'
LENGTH = 'length'
OFFSET = 'offset'
FILENAME = 'filename'
# Extra columns present only in the "+ 3 revisit resolve fields" layouts.
ORIG_LENGTH = 'orig.length'
ORIG_OFFSET = 'orig.offset'
ORIG_FILENAME = 'orig.filename'
#================================================================= #=================================================================
class CDXException(WbException): class CDXException(WbException):
@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
""" """
CDX_FORMATS = [ CDX_FORMATS = [
# Public CDX Format # Public CDX Format
["urlkey", "timestamp", "original", "mimetype", "statuscode", [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
"digest", "length"], DIGEST, LENGTH],
# CDX 11 Format # CDX 11 Format
["urlkey", "timestamp", "original", "mimetype", "statuscode", [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
"digest", "redirect", "robotflags", "length", "offset", "filename"], DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],
# CDX 9 Format # CDX 9 Format
["urlkey", "timestamp", "original", "mimetype", "statuscode", [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
"digest", "redirect", "offset", "filename"], DIGEST, REDIRECT, OFFSET, FILENAME],
# CDX 11 Format + 3 revisit resolve fields # CDX 11 Format + 3 revisit resolve fields
["urlkey", "timestamp", "original", "mimetype", "statuscode", [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
"digest", "redirect", "robotflags", "length", "offset", "filename", DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
"orig.length", "orig.offset", "orig.filename"], ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
# CDX 9 Format + 3 revisit resolve fields # CDX 9 Format + 3 revisit resolve fields
["urlkey", "timestamp", "original", "mimetype", "statuscode", [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
"digest", "redirect", "offset", "filename", DIGEST, REDIRECT, OFFSET, FILENAME,
"orig.length", "orig.offset", "orig.filename"] ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
] ]
# Alternate (short or legacy) field names mapped to the canonical CDX
# field name. Lookups are done with ``CDX_ALT_FIELDS.get(name, name)``,
# so a name that is not listed here passes through unchanged.
#
# Keep every alias unique: a key repeated inside a dict display silently
# keeps only the last value. 's' is the status alias; length uses 'l'.
CDX_ALT_FIELDS = {
    'u': ORIGINAL,
    'url': ORIGINAL,
    'status': STATUSCODE,
    's': STATUSCODE,
    'mime': MIMETYPE,
    'm': MIMETYPE,
    'l': LENGTH,
    'o': OFFSET,
    'd': DIGEST,
    't': TIMESTAMP,
    'k': URLKEY,
    'f': FILENAME,
}
def __init__(self, cdxline=''): def __init__(self, cdxline=''):
OrderedDict.__init__(self) OrderedDict.__init__(self)
@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
self.cdxline = cdxline self.cdxline = cdxline
return return
fields = cdxline.split(' ') fields = cdxline.split(' ' , 2)
# Check for CDX JSON
if fields[-1].startswith('{'):
self[URLKEY] = fields[0]
self[TIMESTAMP] = fields[1]
json_fields = json_decode(fields[-1])
for n, v in json_fields.iteritems():
n = self.CDX_ALT_FIELDS.get(n, n)
self[n] = str(v)
self.cdxline = cdxline
return
more_fields = fields.pop().split(' ')
fields.extend(more_fields)
cdxformat = None cdxformat = None
for i in self.CDX_FORMATS: for i in self.CDX_FORMATS:
@ -80,8 +138,8 @@ class CDXObject(OrderedDict):
def is_revisit(self): def is_revisit(self):
"""return ``True`` if this record is a revisit record.""" """return ``True`` if this record is a revisit record."""
return (self['mimetype'] == 'warc/revisit' or return (self.get(MIMETYPE) == 'warc/revisit' or
self['filename'] == '-') self.get(FILENAME) == '-')
def to_text(self, fields=None): def to_text(self, fields=None):
""" """

View File

@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
# apply filter to cdx[field] # apply filter to cdx[field]
else: else:
self.field = parts[0] self.field = parts[0]
self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
self.field)
string = parts[1] string = parts[1]
# make regex if regex mode # make regex if regex mode
@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
self.filter_str = string self.filter_str = string
def __call__(self, cdx): def __call__(self, cdx):
val = cdx[self.field] if self.field else str(cdx) if not self.field:
val = str(cdx)
else:
val = cdx.get(self.field, '')
matched = self.compare_func(val) matched = self.compare_func(val)
@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
if original_cdx and is_revisit: if original_cdx and is_revisit:
fill_orig = lambda field: original_cdx[field] fill_orig = lambda field: original_cdx[field]
# Transfer mimetype and statuscode # Transfer mimetype and statuscode
cdx['mimetype'] = original_cdx['mimetype'] cdx['mimetype'] = original_cdx.get('mimetype', 'none')
cdx['statuscode'] = original_cdx['statuscode'] cdx['statuscode'] = original_cdx.get('statuscode', 'none')
else: else:
fill_orig = lambda field: '-' fill_orig = lambda field: '-'

View File

@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
if filename.startswith('redis://'): if filename.startswith('redis://'):
return RedisCDXSource(filename, config) return RedisCDXSource(filename, config)
if filename.endswith('.cdx'): if filename.endswith(('.cdx', '.cdxj')):
return CDXFile(filename) return CDXFile(filename)
if filename.endswith(('.summary', '.idx')): if filename.endswith(('.summary', '.idx')):

View File

@ -29,13 +29,10 @@ class CDXFile(CDXSource):
def load_cdx(self, query): def load_cdx(self, query):
def do_open(): def do_open():
try: with open(self.filename, 'rb') as source:
source = open(self.filename, 'rb')
gen = iter_range(source, query.key, query.end_key) gen = iter_range(source, query.key, query.end_key)
for line in gen: for line in gen:
yield line yield line
finally:
source.close()
return do_open() return do_open()
#return iter_range(do_open(), query.key, query.end_key) #return iter_range(do_open(), query.key, query.end_key)

View File

@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter Alt field name
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no field specified, match regex on entire line
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
# Filter exact # Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1') >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output # Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) >>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826 20140126200826
@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) >>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# Resolve Revisit -- cdxj minimal
#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
""" """
#================================================================= #=================================================================

View File

@ -197,6 +197,9 @@ class ArchiveIterator(object):
class ArchiveIndexEntryMixin(object): class ArchiveIndexEntryMixin(object):
MIME_RE = re.compile('[; ]') MIME_RE = re.compile('[; ]')
def reset_entry(self):
    """Blank the ``'key'`` field so this entry object can be reused.

    Only ``'key'`` is reset; every other field already stored on the
    entry keeps its previous value until it is overwritten.
    """
    self['key'] = ''
def extract_mime(self, mime, def_mime='unk'): def extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only """ Utility function to extract mimetype only
from a full content type, removing charset settings from a full content type, removing charset settings
@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
self['status'] = '-' self['status'] = '-'
def set_rec_info(self, offset, length, digest): def set_rec_info(self, offset, length, digest):
self['length'] = str(length)
self['offset'] = str(offset)
if digest: if digest:
self['digest'] = digest self['digest'] = digest
self['length'] = str(length)
self['offset'] = str(offset)
def merge_request_data(self, other, options): def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered', True) surt_ordered = options.get('surt_ordered', True)
@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
class DefaultRecordIter(object): class DefaultRecordIter(object):
def __init__(self, **options): def __init__(self, **options):
self.options = options self.options = options
self.entry_cache = {}
def _create_index_entry(self): def _create_index_entry(self, rec_type):
if self.options.get('cdxj'): try:
return OrderedArchiveIndexEntry() entry = self.entry_cache[rec_type]
else: entry.reset_entry()
return ArchiveIndexEntry() except:
if self.options.get('cdxj'):
entry = OrderedArchiveIndexEntry()
else:
entry = ArchiveIndexEntry()
self.entry_cache[rec_type] = entry
return entry
def create_record_iter(self, arcv_iter): def create_record_iter(self, arcv_iter):
append_post = self.options.get('append_post') append_post = self.options.get('append_post')
@ -295,8 +308,7 @@ class DefaultRecordIter(object):
compute_digest = False compute_digest = False
if (not minimal and if (entry.get('digest', '-') == '-' and
entry.get('digest', '-') == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')): record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True compute_digest = True
@ -312,7 +324,6 @@ class DefaultRecordIter(object):
entry['_post_query'] = post_query entry['_post_query'] = post_query
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest) arcv_iter.read_to_end(record, compute_digest)
entry.set_rec_info(*arcv_iter.member_info) entry.set_rec_info(*arcv_iter.member_info)
entry.record = record entry.record = record
@ -355,7 +366,7 @@ class DefaultRecordIter(object):
""" Parse warc record """ Parse warc record
""" """
entry = self._create_index_entry() entry = self._create_index_entry(record.rec_type)
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
@ -369,12 +380,11 @@ class DefaultRecordIter(object):
entry['timestamp'] = iso_date_to_timestamp(record.rec_headers. entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
get_header('WARC-Date')) get_header('WARC-Date'))
if self.options.get('minimal'):
return entry
# mime # mime
if record.rec_type == 'revisit': if record.rec_type == 'revisit':
entry['mime'] = 'warc/revisit' entry['mime'] = 'warc/revisit'
elif self.options.get('minimal'):
entry['mime'] = '-'
else: else:
def_mime = '-' if record.rec_type == 'request' else 'unk' def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers. entry.extract_mime(record.status_headers.
@ -382,7 +392,7 @@ class DefaultRecordIter(object):
def_mime) def_mime)
# status -- only for response records (by convention): # status -- only for response records (by convention):
if record.rec_type == 'response': if record.rec_type == 'response' and not self.options.get('minimal'):
entry.extract_status(record.status_headers) entry.extract_status(record.status_headers)
else: else:
entry['status'] = '-' entry['status'] = '-'
@ -414,7 +424,7 @@ class DefaultRecordIter(object):
# replace nulls # replace nulls
url = url.replace('\x00', '%00') url = url.replace('\x00', '%00')
entry = self._create_index_entry() entry = self._create_index_entry(record.rec_type)
entry['url'] = url entry['url'] = url
# timestamp # timestamp
@ -422,14 +432,12 @@ class DefaultRecordIter(object):
if len(entry['timestamp']) > 14: if len(entry['timestamp']) > 14:
entry['timestamp'] = entry['timestamp'][:14] entry['timestamp'] = entry['timestamp'][:14]
if self.options.get('minimal'): if not self.options.get('minimal'):
return entry # mime
entry.extract_mime(record.rec_headers.get_header('content-type'))
# mime # status
entry.extract_mime(record.rec_headers.get_header('content-type')) entry.extract_status(record.status_headers)
# status
entry.extract_status(record.status_headers)
# digest # digest
entry['digest'] = '-' entry['digest'] = '-'

View File

@ -16,6 +16,7 @@ from io import BytesIO
from archiveiterator import DefaultRecordIter from archiveiterator import DefaultRecordIter
#================================================================= #=================================================================
class BaseCDXWriter(object): class BaseCDXWriter(object):
def __init__(self, out): def __init__(self, out):
@ -68,26 +69,6 @@ class CDXJ(object):
out.write('\n') out.write('\n')
#=================================================================
class CDX06(object):
def _write_header(self):
self.out.write(' CDX N b a S V g\n')
def write_cdx_line(self, out, entry, filename):
out.write(entry['key'])
out.write(' ')
out.write(entry['timestamp'])
out.write(' ')
out.write(entry['url'])
out.write(' ')
out.write(entry['length'])
out.write(' ')
out.write(entry['offset'])
out.write(' ')
out.write(filename)
out.write('\n')
#================================================================= #=================================================================
class CDX09(object): class CDX09(object):
def _write_header(self): def _write_header(self):
@ -201,6 +182,8 @@ def cdx_filename(filename):
#================================================================= #=================================================================
def get_cdx_writer_cls(options): def get_cdx_writer_cls(options):
writer_cls = options.get('writer_cls') writer_cls = options.get('writer_cls')
if options.get('minimal'):
options['cdxj'] = True
if writer_cls: if writer_cls:
if not options.get('writer_add_mixin'): if not options.get('writer_add_mixin'):
@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):
if options.get('cdxj'): if options.get('cdxj'):
format_mixin = CDXJ format_mixin = CDXJ
elif options.get('cdx06') or options.get('minimal'):
format_mixin = CDX06
elif options.get('cdx09'): elif options.get('cdx09'):
format_mixin = CDX09 format_mixin = CDX09
else: else:
@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
cdx09_help = """ cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field Use older 9-field cdx format, default is 11-cdx field
""" """
minimal_help = """ minimal_json_help = """
Use a minimal 6-field cdx format, outputing only the basic fields CDX JSON output, but with minimal fields only, available w/o parsing
needed to identiyfy record: http record. The fields are:
canonicalized url, timestamp, original url, archive offset, archive length canonicalized url, timestamp, original url, digest, archive offset, archive length
and archive filename. and archive filename. mimetype is included to indicate warc/revisit only.
This option skips record parsing and will not work with This option skips record parsing and will not work with
POST append (-p) option POST append (-p) option
"""
json_help = """
Output CDX JSON format per line, with url timestamp first, followed by json dict
for all other fields:
url timestamp { ... }
""" """
output_help = """output file or directory. output_help = """output file or directory.
@ -370,15 +357,13 @@ if input is a directory"""
action='store_true', action='store_true',
help=cdx09_help) help=cdx09_help)
group.add_argument('-6', '--cdx06',
action='store_true')
group.add_argument('-j', '--cdxj', group.add_argument('-j', '--cdxj',
action='store_true')
parser.add_argument('-m', '--minimal',
action='store_true', action='store_true',
help=minimal_help) help=json_help)
parser.add_argument('-mj', '--minimal-cdxj',
action='store_true',
help=minimal_json_help)
parser.add_argument('output', nargs='?', default='-', help=output_help) parser.add_argument('output', nargs='?', default='-', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help) parser.add_argument('inputs', nargs='+', help=input_help)
@ -392,9 +377,8 @@ if input is a directory"""
append_post=cmd.postappend, append_post=cmd.postappend,
recurse=cmd.recurse, recurse=cmd.recurse,
cdx09=cmd.cdx09, cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
cdxj=cmd.cdxj, cdxj=cmd.cdxj,
minimal=cmd.minimal) minimal=cmd.minimal_cdxj)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- minimal cdx
>>> print_cdx_index('example.warc.gz', minimal=True)
CDX N b a S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
# warc.gz -- minimal CDXJ # warc.gz -- minimal CDXJ
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True) >>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"} com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"} com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"} org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc.gz -- parse all # warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True) >>> print_cdx_index('example.warc.gz', include_all=True)
@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
>>> print_cdx_index('example.arc.gz', cdxj=True) >>> print_cdx_index('example.arc.gz', cdxj=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"} com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc.gz -- minimal + json
>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc # arc
>>> print_cdx_index('example.arc') >>> print_cdx_index('example.arc')
CDX N b a m s k r M S V g CDX N b a m s k r M S V g

View File

@ -60,6 +60,8 @@ setup(
}, },
data_files=[ data_files=[
('sample_archive/cdx', glob.glob('sample_archive/cdx/*')), ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')), ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs', glob.glob('sample_archive/warcs/*')), ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content', ('sample_archive/text_content',

View File

@ -46,6 +46,8 @@ collections:
index_paths: ./sample_archive/cdx/ index_paths: ./sample_archive/cdx/
redir_to_exact: false redir_to_exact: false
pywb-cdxj:
index_paths: ./sample_archive/cdxj/
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # indicate if cdx files are sorted by SURT keys -- eg: com,example)/

View File

@ -124,6 +124,14 @@ class TestWb:
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body
def test_replay_cdxj(self):
    """Replay a capture through the ``pywb-cdxj`` collection route."""
    replay_url = '/pywb-cdxj/20140103030321/http://example.com?example=1'
    resp = self.testapp.get(replay_url)
    self._assert_basic_html(resp)

    # Capture timestamp, client-side script and a rewritten link must all
    # be present in the replayed page.
    expected_fragments = (
        '"20140103030321"',
        'wb.js',
        '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example',
    )
    for fragment in expected_fragments:
        assert fragment in resp.body
def test_zero_len_revisit(self): def test_zero_len_revisit(self):
resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2') resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
self._assert_basic_html(resp) self._assert_basic_html(resp)