1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdxj: support loading cdxj (#76)

cdx obj: allow alt field names to be used (eg. mime, mimetype, m)
(status/statuscode/s) in querying and reading cdx
cdx minimal: (#75) now implies cdxj to avoid more formats
minimal includes digest always and mime when warc/revisit
tests for cdxj loading
indexing optimization: reuse same entry obj for records of same type
This commit is contained in:
Ilya Kreymer 2015-03-19 11:20:40 -07:00
parent 73f24f5a2b
commit fe1c32c8f7
12 changed files with 174 additions and 95 deletions

View File

@ -107,8 +107,8 @@ enable_http_proxy: true
# List of route names:
# <route>: <package or file path>
# default route static/default for pywb defaults
static_routes:
static/default: pywb/static/
#static_routes:
# static/default: pywb/static/
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true

View File

@ -10,6 +10,26 @@ from urlparse import parse_qs
from pywb.utils.wbexception import WbException
from json import loads as json_decode
#=================================================================
# Canonical CDX field names.
# These constants are the keys used by the CDX_FORMATS field lists and are
# the targets of the CDX_ALT_FIELDS alias map, so the literal strings are
# spelled in one place only.
URLKEY = 'urlkey'
TIMESTAMP = 'timestamp'
ORIGINAL = 'original'
MIMETYPE = 'mimetype'
STATUSCODE = 'statuscode'
DIGEST = 'digest'
REDIRECT = 'redirect'
ROBOTFLAGS = 'robotflags'
LENGTH = 'length'
OFFSET = 'offset'
FILENAME = 'filename'
# Extra fields appended by the "+ 3 revisit resolve fields" formats.
ORIG_LENGTH = 'orig.length'
ORIG_OFFSET = 'orig.offset'
ORIG_FILENAME = 'orig.filename'
#=================================================================
class CDXException(WbException):
@ -24,28 +44,53 @@ class CDXObject(OrderedDict):
"""
CDX_FORMATS = [
# Public CDX Format
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "length"],
[URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
DIGEST, LENGTH],
# CDX 11 Format
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename"],
[URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME],
# CDX 9 Format
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename"],
[URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
DIGEST, REDIRECT, OFFSET, FILENAME],
# CDX 11 Format + 3 revisit resolve fields
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "robotflags", "length", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"],
[URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
DIGEST, REDIRECT, ROBOTFLAGS, LENGTH, OFFSET, FILENAME,
ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
# CDX 9 Format + 3 revisit resolve fields
["urlkey", "timestamp", "original", "mimetype", "statuscode",
"digest", "redirect", "offset", "filename",
"orig.length", "orig.offset", "orig.filename"]
[URLKEY, TIMESTAMP, ORIGINAL, MIMETYPE, STATUSCODE,
DIGEST, REDIRECT, OFFSET, FILENAME,
ORIG_LENGTH, ORIG_OFFSET, ORIG_FILENAME],
]
# Alternate (short or legacy) field names accepted when querying/filtering
# and when reading CDXJ json keys; each one maps to its canonical field name.
#
# Every alias must appear exactly once: a dict literal silently keeps only
# the LAST value for a repeated key. The map previously listed 's' twice
# (STATUSCODE, then LENGTH), so 's' resolved to LENGTH. 's' is the status
# code column in the CDX header legend (' CDX N b a m s k r M S V g'), so it
# is kept for STATUSCODE only; 'l' remains the short alias for LENGTH.
CDX_ALT_FIELDS = {
    'u': ORIGINAL,
    'url': ORIGINAL,

    'status': STATUSCODE,
    's': STATUSCODE,

    'mime': MIMETYPE,
    'm': MIMETYPE,

    'l': LENGTH,

    'o': OFFSET,

    'd': DIGEST,

    't': TIMESTAMP,

    'k': URLKEY,

    'f': FILENAME,
}
def __init__(self, cdxline=''):
OrderedDict.__init__(self)
@ -56,7 +101,20 @@ class CDXObject(OrderedDict):
self.cdxline = cdxline
return
fields = cdxline.split(' ')
fields = cdxline.split(' ' , 2)
# Check for CDX JSON
if fields[-1].startswith('{'):
self[URLKEY] = fields[0]
self[TIMESTAMP] = fields[1]
json_fields = json_decode(fields[-1])
for n, v in json_fields.iteritems():
n = self.CDX_ALT_FIELDS.get(n, n)
self[n] = str(v)
self.cdxline = cdxline
return
more_fields = fields.pop().split(' ')
fields.extend(more_fields)
cdxformat = None
for i in self.CDX_FORMATS:
@ -80,8 +138,8 @@ class CDXObject(OrderedDict):
def is_revisit(self):
"""return ``True`` if this record is a revisit record."""
return (self['mimetype'] == 'warc/revisit' or
self['filename'] == '-')
return (self.get(MIMETYPE) == 'warc/revisit' or
self.get(FILENAME) == '-')
def to_text(self, fields=None):
"""

View File

@ -172,6 +172,8 @@ def cdx_filter(cdx_iter, filter_strings):
# apply filter to cdx[field]
else:
self.field = parts[0]
self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
self.field)
string = parts[1]
# make regex if regex mode
@ -181,7 +183,10 @@ def cdx_filter(cdx_iter, filter_strings):
self.filter_str = string
def __call__(self, cdx):
val = cdx[self.field] if self.field else str(cdx)
if not self.field:
val = str(cdx)
else:
val = cdx.get(self.field, '')
matched = self.compare_func(val)
@ -280,8 +285,8 @@ def cdx_resolve_revisits(cdx_iter):
if original_cdx and is_revisit:
fill_orig = lambda field: original_cdx[field]
# Transfer mimetype and statuscode
cdx['mimetype'] = original_cdx['mimetype']
cdx['statuscode'] = original_cdx['statuscode']
cdx['mimetype'] = original_cdx.get('mimetype', 'none')
cdx['statuscode'] = original_cdx.get('statuscode', 'none')
else:
fill_orig = lambda field: '-'

View File

@ -167,7 +167,7 @@ class CDXServer(BaseCDXServer):
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
if filename.endswith(('.cdx', '.cdxj')):
return CDXFile(filename)
if filename.endswith(('.summary', '.idx')):

View File

@ -29,13 +29,10 @@ class CDXFile(CDXSource):
def load_cdx(self, query):
def do_open():
try:
source = open(self.filename, 'rb')
with open(self.filename, 'rb') as source:
gen = iter_range(source, query.key, query.end_key)
for line in gen:
yield line
finally:
source.close()
return do_open()
#return iter_range(do_open(), query.key, query.end_key)

View File

@ -48,6 +48,19 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter Alt field name
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'status:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no field specified, match regex on entire line
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = '~screen.css 20140126200625')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter -- no such field, no matches
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'blah:200')
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/_css/2013.1/screen.css
# Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
@ -82,7 +95,6 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
@ -138,6 +150,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# Resolve Revisit -- cdxj minimal
#>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example.cdxj'], resolveRevisits=True)
"""
#=================================================================

View File

@ -197,6 +197,9 @@ class ArchiveIterator(object):
class ArchiveIndexEntryMixin(object):
MIME_RE = re.compile('[; ]')
def reset_entry(self):
    """ Blank the 'key' field so this entry object can be reused
    for another record.

    Only 'key' is reset; every other field keeps whatever value it
    had until it is assigned again.
    """
    # NOTE(review): fields that are set conditionally (e.g. 'digest',
    # '_post_query') may carry over from the previous record when an
    # entry is reused -- confirm callers always overwrite them.
    self['key'] = ''
def extract_mime(self, mime, def_mime='unk'):
""" Utility function to extract mimetype only
from a full content type, removing charset settings
@ -215,11 +218,12 @@ class ArchiveIndexEntryMixin(object):
self['status'] = '-'
def set_rec_info(self, offset, length, digest):
self['length'] = str(length)
self['offset'] = str(offset)
if digest:
self['digest'] = digest
self['length'] = str(length)
self['offset'] = str(offset)
def merge_request_data(self, other, options):
surt_ordered = options.get('surt_ordered', True)
@ -248,12 +252,21 @@ class ArchiveIndexEntryMixin(object):
class DefaultRecordIter(object):
def __init__(self, **options):
self.options = options
self.entry_cache = {}
def _create_index_entry(self):
def _create_index_entry(self, rec_type):
try:
entry = self.entry_cache[rec_type]
entry.reset_entry()
except:
if self.options.get('cdxj'):
return OrderedArchiveIndexEntry()
entry = OrderedArchiveIndexEntry()
else:
return ArchiveIndexEntry()
entry = ArchiveIndexEntry()
self.entry_cache[rec_type] = entry
return entry
def create_record_iter(self, arcv_iter):
append_post = self.options.get('append_post')
@ -295,8 +308,7 @@ class DefaultRecordIter(object):
compute_digest = False
if (not minimal and
entry.get('digest', '-') == '-' and
if (entry.get('digest', '-') == '-' and
record.rec_type not in ('revisit', 'request', 'warcinfo')):
compute_digest = True
@ -312,7 +324,6 @@ class DefaultRecordIter(object):
entry['_post_query'] = post_query
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest)
entry.set_rec_info(*arcv_iter.member_info)
entry.record = record
@ -355,7 +366,7 @@ class DefaultRecordIter(object):
""" Parse warc record
"""
entry = self._create_index_entry()
entry = self._create_index_entry(record.rec_type)
if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename')
@ -369,12 +380,11 @@ class DefaultRecordIter(object):
entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
get_header('WARC-Date'))
if self.options.get('minimal'):
return entry
# mime
if record.rec_type == 'revisit':
entry['mime'] = 'warc/revisit'
elif self.options.get('minimal'):
entry['mime'] = '-'
else:
def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers.
@ -382,7 +392,7 @@ class DefaultRecordIter(object):
def_mime)
# status -- only for response records (by convention):
if record.rec_type == 'response':
if record.rec_type == 'response' and not self.options.get('minimal'):
entry.extract_status(record.status_headers)
else:
entry['status'] = '-'
@ -414,7 +424,7 @@ class DefaultRecordIter(object):
# replace nulls
url = url.replace('\x00', '%00')
entry = self._create_index_entry()
entry = self._create_index_entry(record.rec_type)
entry['url'] = url
# timestamp
@ -422,9 +432,7 @@ class DefaultRecordIter(object):
if len(entry['timestamp']) > 14:
entry['timestamp'] = entry['timestamp'][:14]
if self.options.get('minimal'):
return entry
if not self.options.get('minimal'):
# mime
entry.extract_mime(record.rec_headers.get_header('content-type'))

View File

@ -16,6 +16,7 @@ from io import BytesIO
from archiveiterator import DefaultRecordIter
#=================================================================
class BaseCDXWriter(object):
def __init__(self, out):
@ -68,26 +69,6 @@ class CDXJ(object):
out.write('\n')
#=================================================================
class CDX06(object):
    """ Writer mixin for the 6-field cdx format:
    key, timestamp, url, length, offset, filename.

    ``_write_header`` writes to ``self.out``, which this class does not
    set itself (presumably provided by the writer it is mixed into).
    """

    # entry fields emitted before the filename, in output order
    _ENTRY_FIELDS = ('key', 'timestamp', 'url', 'length', 'offset')

    def _write_header(self):
        """ Write the cdx header line naming the six columns. """
        self.out.write(' CDX N b a S V g\n')

    def write_cdx_line(self, out, entry, filename):
        """ Write one space-separated cdx line for ``entry`` to ``out``,
        ending with ``filename`` and a newline.
        """
        for name in self._ENTRY_FIELDS:
            out.write(entry[name])
            out.write(' ')

        out.write(filename)
        out.write('\n')
#=================================================================
class CDX09(object):
def _write_header(self):
@ -201,6 +182,8 @@ def cdx_filename(filename):
#=================================================================
def get_cdx_writer_cls(options):
writer_cls = options.get('writer_cls')
if options.get('minimal'):
options['cdxj'] = True
if writer_cls:
if not options.get('writer_add_mixin'):
@ -212,8 +195,6 @@ def get_cdx_writer_cls(options):
if options.get('cdxj'):
format_mixin = CDXJ
elif options.get('cdx06') or options.get('minimal'):
format_mixin = CDX06
elif options.get('cdx09'):
format_mixin = CDX09
else:
@ -311,14 +292,20 @@ Not-recommended for new cdx, use only for backwards-compatibility.
cdx09_help = """
Use older 9-field cdx format, default is 11-cdx field
"""
minimal_help = """
Use a minimal 6-field cdx format, outputing only the basic fields
needed to identiyfy record:
canonicalized url, timestamp, original url, archive offset, archive length
and archive filename.
minimal_json_help = """
CDX JSON output, but with minimal fields only, available w/o parsing
http record. The fields are:
canonicalized url, timestamp, original url, digest, archive offset, archive length
and archive filename. mimetype is included to indicate warc/revisit only.
This option skips record parsing and will not work with
POST append (-p) option
"""
json_help = """
Output CDX JSON format per line, with url timestamp first, followed by json dict
for all other fields:
url timestamp { ... }
"""
output_help = """output file or directory.
@ -370,15 +357,13 @@ if input is a directory"""
action='store_true',
help=cdx09_help)
group.add_argument('-6', '--cdx06',
action='store_true')
group.add_argument('-j', '--cdxj',
action='store_true')
parser.add_argument('-m', '--minimal',
action='store_true',
help=minimal_help)
help=json_help)
parser.add_argument('-mj', '--minimal-cdxj',
action='store_true',
help=minimal_json_help)
parser.add_argument('output', nargs='?', default='-', help=output_help)
parser.add_argument('inputs', nargs='+', help=input_help)
@ -392,9 +377,8 @@ if input is a directory"""
append_post=cmd.postappend,
recurse=cmd.recurse,
cdx09=cmd.cdx09,
cdx06=cmd.cdx06,
cdxj=cmd.cdxj,
minimal=cmd.minimal)
minimal=cmd.minimal_cdxj)
if __name__ == '__main__':

View File

@ -8,18 +8,11 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
# warc.gz -- minimal cdx
>>> print_cdx_index('example.warc.gz', minimal=True)
CDX N b a S V g
com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
# warc.gz -- minimal CDXJ
>>> print_cdx_index('example.warc.gz', minimal=True, cdxj=True)
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
# warc.gz -- parse all
>>> print_cdx_index('example.warc.gz', include_all=True)
@ -63,6 +56,10 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ
>>> print_cdx_index('example.arc.gz', cdxj=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc.gz -- minimal + json
>>> print_cdx_index('example.arc.gz', cdxj=True, minimal=True)
com,example)/ 20140216050221 {"url": "http://example.com/", "digest": "PEWDX5GTH66WU74WBPGFECIYBMPMP3FP", "length": "856", "offset": "171", "filename": "example.arc.gz"}
# arc
>>> print_cdx_index('example.arc')
CDX N b a m s k r M S V g

View File

@ -60,6 +60,8 @@ setup(
},
data_files=[
('sample_archive/cdx', glob.glob('sample_archive/cdx/*')),
('sample_archive/cdxj', glob.glob('sample_archive/cdxj/*')),
('sample_archive/non-surt-cdx', glob.glob('sample_archive/non-surt-cdx/*')),
('sample_archive/zipcdx', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content',

View File

@ -46,6 +46,8 @@ collections:
index_paths: ./sample_archive/cdx/
redir_to_exact: false
pywb-cdxj:
index_paths: ./sample_archive/cdxj/
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/

View File

@ -124,6 +124,14 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body
def test_replay_cdxj(self):
    """ Replay a capture from the 'pywb-cdxj' collection and check
    the rendered page: basic html, the capture timestamp, the wb.js
    include and a link rewritten under the collection prefix.
    """
    replay_url = '/pywb-cdxj/20140103030321/http://example.com?example=1'
    response = self.testapp.get(replay_url)
    self._assert_basic_html(response)

    # timestamp of the replayed capture appears quoted in the page
    assert '"20140103030321"' in response.body
    # client-side rewriting script is included
    assert 'wb.js' in response.body
    # outgoing link is rewritten to stay inside the cdxj collection
    assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in response.body
def test_zero_len_revisit(self):
resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
self._assert_basic_html(resp)