cdxj: support loading cdxj (#76)
cdx obj: allow alt field names to be used (e.g. mime/mimetype/m, status/statuscode/s) in querying and reading cdx

cdx minimal: (#75) now implies cdxj to avoid more formats; minimal includes digest always, and mime when warc/revisit

tests for cdxj loading

indexing optimization: reuse same entry obj for records of same type
parent 73f24f5a2b
commit fe1c32c8f7
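
This change teaches CDXObject to parse CDXJ lines (urlkey and timestamp followed by a JSON block) and to normalize alternate field names. A minimal usage sketch, assuming the pywb.cdx.cdxobject module path of this codebase and a sample line copied from the doctests further down:

    from pywb.cdx.cdxobject import CDXObject

    # sample CDXJ line taken from the indexer doctests below
    line = ('com,example)/?example=1 20140103030341 '
            '{"url": "http://example.com?example=1", "mime": "warc/revisit", '
            '"digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", '
            '"offset": "1864", "filename": "example.warc.gz"}')

    cdx = CDXObject(line)

    # the alt name 'mime' is stored under the canonical 'mimetype' key
    assert cdx['mimetype'] == 'warc/revisit'
    assert cdx.is_revisit()
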
@@ -107,8 +107,8 @@ enable_http_proxy: true
# List of route names:
# <route>: <package or file path>
# default route static/default for pywb defaults
static_routes:
    static/default: pywb/static/
#static_routes:
#    static/default: pywb/static/

# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
@@ -10,6 +10,26 @@ from urlparse import parse_qs

from pywb.utils.wbexception import WbException

from json import loads as json_decode


#=================================================================
URLKEY = 'urlkey'
TIMESTAMP = 'timestamp'
ORIGINAL = 'original'
MIMETYPE = 'mimetype'
STATUSCODE = 'statuscode'
DIGEST = 'digest'
REDIRECT = 'redirect'
ROBOTFLAGS = 'robotflags'
LENGTH = 'length'
OFFSET = 'offset'
FILENAME = 'filename'

ORIG_LENGTH = 'orig.length'
ORIG_OFFSET = 'orig.offset'
ORIG_FILENAME = 'orig.filename'


#=================================================================
class CDXException(WbException):
@@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
    """
    CDX_FORMATS = [
        # Public CDX Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "length"],
        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
         DIGEST, LENGTH],

        # CDX 11 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "robotflags", "length", "offset", "filename"],
        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],

        # CDX 9 Format
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "offset", "filename"],
        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
         DIGEST, REDIRECT, OFFSET, FILENAME],

        # CDX 11 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "robotflags", "length", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"],
        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
         DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],

        # CDX 9 Format + 3 revisit resolve fields
        ["urlkey", "timestamp", "original", "mimetype", "statuscode",
         "digest", "redirect", "offset", "filename",
         "orig.length", "orig.offset", "orig.filename"]
        [URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
         DIGEST, REDIRECT, OFFSET, FILENAME,
         ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
    ]

    CDX_ALT_FIELDS = {
        'u': ORIGINAL,
        'url': ORIGINAL,

        'status': STATUSCODE,
        's': STATUSCODE,

        'mime': MIMETYPE,
        'm': MIMETYPE,

        'l': LENGTH,
        'S': LENGTH,

        'o': OFFSET,

        'd': DIGEST,

        't': TIMESTAMP,

        'k': URLKEY,

        'f': FILENAME
    }

    def __init__(self, cdxline=''):
        OrderedDict.__init__(self)
@@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
            self.cdxline = cdxline
            return

        fields = cdxline.split(' ')
        fields = cdxline.split(' ', 2)
        # Check for CDX JSON
        if fields[-1].startswith('{'):
            self[URLKEY] = fields[0]
            self[TIMESTAMP] = fields[1]
            json_fields = json_decode(fields[-1])
            for n, v in json_fields.iteritems():
                n = self.CDX_ALT_FIELDS.get(n, n)
                self[n] = str(v)
            self.cdxline = cdxline
            return

        more_fields = fields.pop().split(' ')
        fields.extend(more_fields)

        cdxformat = None
        for i in self.CDX_FORMATS:
@@ -80,8 +138,8 @@ class CDXObject(OrderedDict):

    def is_revisit(self):
        """return ``True`` if this record is a revisit record."""
        return (self['mimetype'] == 'warc/revisit' or
                self['filename'] == '-')
        return (self.get(MIMETYPE) == 'warc/revisit' or
                self.get(FILENAME) == '-')

    def to_text(self, fields=None):
        """
@@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
            # apply filter to cdx[field]
            else:
                self.field = parts[0]
                self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
                                                          self.field)
                string = parts[1]

            # make regex if regex mode
@@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
            self.filter_str = string

        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
            if not self.field:
                val = str(cdx)
            else:
                val = cdx.get(self.field, '')

            matched = self.compare_func(val)
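
The filter now resolves alternate field names and tolerates fields that are absent from a record, as minimal CDXJ entries often are. A standalone sketch of that lookup with a stand-in alias table (illustrative names, not the pywb API):

    # stand-in subset of CDXObject.CDX_ALT_FIELDS, for illustration only
    ALT_FIELDS = {'status': 'statuscode', 's': 'statuscode',
                  'mime': 'mimetype', 'm': 'mimetype'}

    def filter_value(cdx, field):
        field = ALT_FIELDS.get(field, field)   # normalize an alt field name
        return cdx.get(field, '')              # a missing field compares as ''

    cdx = {'urlkey': 'com,example)/', 'statuscode': '200'}
    assert filter_value(cdx, 'status') == '200'
    assert filter_value(cdx, 'mime') == ''     # absent in a minimal record
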
@@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
        if original_cdx and is_revisit:
            fill_orig = lambda field: original_cdx[field]
            # Transfer mimetype and statuscode
            cdx['mimetype'] = original_cdx['mimetype']
            cdx['statuscode'] = original_cdx['statuscode']
            cdx['mimetype'] = original_cdx.get('mimetype', 'none')
            cdx['statuscode'] = original_cdx.get('statuscode', 'none')
        else:
            fill_orig = lambda field: '-'
@@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
        if filename.startswith('redis://'):
            return RedisCDXSource(filename, config)

        if filename.endswith('.cdx'):
        if filename.endswith(('.cdx', '.cdxj')):
            return CDXFile(filename)

        if filename.endswith(('.summary', '.idx')):
@@ -29,13 +29,10 @@ class CDXFile(CDXSource):

    def load_cdx(self, query):
        def do_open():
            try:
                source = open(self.filename, 'rb')
            with open(self.filename, 'rb') as source:
                gen = iter_range(source, query.key, query.end_key)
                for line in gen:
                    yield line
            finally:
                source.close()

        return do_open()
        #return iter_range(do_open(), query.key, query.end_key)
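
Opening the file inside a with block within the generator keeps the handle open only while the generator is consumed, and closes it afterwards. A minimal standalone sketch of the same pattern (illustrative names, not the pywb code):

    def read_range(path):
        # the file opens on first iteration and closes once the generator
        # is exhausted or garbage collected
        with open(path, 'rb') as source:
            for line in source:
                yield line

    # usage: for line in read_range('example.cdx'): ...
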
@@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter Alt field name
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter -- no field specified, match regex on entire line
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz

# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css

# Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz


# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
@@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -

# Resolve Revisit -- cdxj minimal
#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)


"""

#=================================================================
@@ -197,6 +197,9 @@ class ArchiveIterator(object):
class ArchiveIndexEntryMixin(object):
    MIME_RE = re.compile('[; ]')

    def reset_entry(self):
        self['key'] = ''

    def extract_mime(self, mime, def_mime='unk'):
        """ Utility function to extract mimetype only
        from a full content type, removing charset settings
@@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
            self['status'] = '-'

    def set_rec_info(self, offset, length, digest):
        self['length'] = str(length)
        self['offset'] = str(offset)
        if digest:
            self['digest'] = digest

        self['length'] = str(length)
        self['offset'] = str(offset)

    def merge_request_data(self, other, options):
        surt_ordered = options.get('surt_ordered', True)
@@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
class DefaultRecordIter(object):
    def __init__(self, **options):
        self.options = options
        self.entry_cache = {}

    def _create_index_entry(self):
        if self.options.get('cdxj'):
            return OrderedArchiveIndexEntry()
        else:
            return ArchiveIndexEntry()
    def _create_index_entry(self, rec_type):
        try:
            entry = self.entry_cache[rec_type]
            entry.reset_entry()
        except:
            if self.options.get('cdxj'):
                entry = OrderedArchiveIndexEntry()
            else:
                entry = ArchiveIndexEntry()

            self.entry_cache[rec_type] = entry

        return entry

    def create_record_iter(self, arcv_iter):
        append_post = self.options.get('append_post')
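
The indexer now keeps one reusable entry object per record type and resets it instead of allocating a new dict for every record. A standalone sketch of the caching pattern with stand-in classes (illustrative, not the pywb implementation):

    class Entry(dict):
        def reset_entry(self):
            # stand-in reset; the real mixin re-initializes its 'key' field
            self.clear()
            self['key'] = ''

    entry_cache = {}

    def get_entry(rec_type):
        try:
            entry = entry_cache[rec_type]
            entry.reset_entry()
        except KeyError:
            entry = entry_cache[rec_type] = Entry()
        return entry

    # records of the same type get the same (reset) object back
    assert get_entry('response') is get_entry('response')
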
@@ -295,8 +308,7 @@ class DefaultRecordIter(object):

            compute_digest = False

            if (not minimal and
                entry.get('digest', '-') == '-' and
            if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):

                compute_digest = True
@@ -312,7 +324,6 @@ class DefaultRecordIter(object):

            entry['_post_query'] = post_query

            #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record
@@ -355,7 +366,7 @@ class DefaultRecordIter(object):
        """ Parse warc record
        """

        entry = self._create_index_entry()
        entry = self._create_index_entry(record.rec_type)

        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
@@ -369,12 +380,11 @@ class DefaultRecordIter(object):
        entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                   get_header('WARC-Date'))

        if self.options.get('minimal'):
            return entry

        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
        elif self.options.get('minimal'):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
@@ -382,7 +392,7 @@ class DefaultRecordIter(object):
                               def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response':
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
        else:
            entry['status'] = '-'
@@ -414,7 +424,7 @@ class DefaultRecordIter(object):
        # replace nulls
        url = url.replace('\x00', '%00')

        entry = self._create_index_entry()
        entry = self._create_index_entry(record.rec_type)
        entry['url'] = url

        # timestamp
@@ -422,14 +432,12 @@ class DefaultRecordIter(object):
        if len(entry['timestamp']) > 14:
            entry['timestamp'] = entry['timestamp'][:14]

        if self.options.get('minimal'):
            return entry
        if not self.options.get('minimal'):
            # mime
            entry.extract_mime(record.rec_headers.get_header('content-type'))

        # mime
        entry.extract_mime(record.rec_headers.get_header('content-type'))

        # status
        entry.extract_status(record.status_headers)
            # status
            entry.extract_status(record.status_headers)

        # digest
        entry['digest'] = '-'
@@ -16,6 +16,7 @@ from io import BytesIO

from archiveiterator import DefaultRecordIter


#=================================================================
class BaseCDXWriter(object):
    def __init__(self, out):
@@ -68,26 +69,6 @@ class CDXJ(object):
        out.write('\n')


#=================================================================
class CDX06(object):
    def _write_header(self):
        self.out.write(' CDX N b a S V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['key'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        out.write(entry['url'])
        out.write(' ')
        out.write(entry['length'])
        out.write(' ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')


#=================================================================
class CDX09(object):
    def _write_header(self):
@@ -201,6 +182,8 @@ def cdx_filename(filename):

#=================================================================
def get_cdx_writer_cls(options):
    writer_cls = options.get('writer_cls')
    if options.get('minimal'):
        options['cdxj'] = True

    if writer_cls:
        if not options.get('writer_add_mixin'):
@@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):
    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx06') or options.get('minimal'):
        format_mixin = CDX06
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
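
With the CDX06 writer removed, minimal output is routed through the CDXJ mixin. A simplified sketch of the resulting selection, using string stand-ins for the writer classes (the 11-field fallback name here is assumed from the default described in the help text):

    def pick_format(options):
        if options.get('minimal'):
            options['cdxj'] = True          # minimal now implies cdxj
        if options.get('cdxj'):
            return 'CDXJ'
        elif options.get('cdx09'):
            return 'CDX09'
        return 'CDX11'                      # assumed 11-field default

    assert pick_format({'minimal': True}) == 'CDXJ'
    assert pick_format({'cdx09': True}) == 'CDX09'
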
@@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field
"""
minimal_help = """
Use a minimal 6-field cdx format, outputing only the basic fields
needed to identiyfy record:
canonicalized url, timestamp, original url, archive offset, archive length
and archive filename.
minimal_json_help = """
CDX JSON output, but with minimal fields only, available w/o parsing
http record. The fields are:
canonicalized url, timestamp, original url, digest, archive offset, archive length
and archive filename. mimetype is included to indicate warc/revisit only.

This option skips record parsing and will not work with
POST append (-p) option
"""

json_help = """
Output CDX JSON format per line, with url timestamp first, followed by json dict
for all other fields:
url timestamp { ... }
"""

output_help = """output file or directory.
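
Concretely, the new -mj/--minimal-cdxj output pairs the urlkey and timestamp with a small JSON block, as the indexer doctests further down show, for example:

    com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
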
@@ -370,15 +357,13 @@ if input is a directory"""
                       action='store_true',
                       help=cdx09_help)

    group.add_argument('-6', '--cdx06',
                       action='store_true')

    group.add_argument('-j', '--cdxj',
                       action='store_true')

    parser.add_argument('-m', '--minimal',
                        action='store_true',
                        help=minimal_help)
                       help=json_help)

    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)

    parser.add_argument('output', nargs='?', default='-', help=output_help)
    parser.add_argument('inputs', nargs='+', help=input_help)
@@ -392,9 +377,8 @@ if input is a directory"""
                        append_post=cmd.postappend,
                        recurse=cmd.recurse,
                        cdx09=cmd.cdx09,
                        cdx06=cmd.cdx06,
                        cdxj=cmd.cdxj,
                        minimal=cmd.minimal)
                        minimal=cmd.minimal_cdxj)


if __name__ == '__main__':
@@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz

# warc.gz -- minimal cdx
>>> print_cdx_index('example.warc.gz', minimal=True)
 CDX N b a S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz

# warc.gz -- minimal CDXJ
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}

# warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True)
@@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
>>> print_cdx_index('example.arc.gz', cdxj=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}

# arc.gz -- minimal + json
>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}

# arc
>>> print_cdx_index('example.arc')
 CDX N b a m s k r M S V g
setup.py
@@ -60,6 +60,8 @@ setup(
    },
    data_files=[
        ('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
        ('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
        ('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
        ('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
        ('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
        ('sample_archive/text_content',
@@ -46,6 +46,8 @@ collections:
        index_paths: ./sample_archive/cdx/
        redir_to_exact: false

    pywb-cdxj:
        index_paths: ./sample_archive/cdxj/


# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
@@ -124,6 +124,14 @@ class TestWb:
        assert 'wb.js' in resp.body
        assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body

    def test_replay_cdxj(self):
        resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1')
        self._assert_basic_html(resp)

        assert '"20140103030321"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body

    def test_zero_len_revisit(self):
        resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
        self._assert_basic_html(resp)