mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdxj: ensure revisit resolve is skipped if the digest is missing, as may be the case in cdxj (#85)
This commit is contained in:
parent
2dbde35d74
commit
85082e46bf
@ -291,8 +291,12 @@ def cdx_resolve_revisits(cdx_iter):
|
||||
for cdx in cdx_iter:
|
||||
is_revisit = cdx.is_revisit()
|
||||
|
||||
digest = cdx[DIGEST]
|
||||
digest = cdx.get(DIGEST)
|
||||
|
||||
original_cdx = None
|
||||
|
||||
# only set if digest is valid, otherwise no way to resolve
|
||||
if digest:
|
||||
original_cdx = originals.get(digest)
|
||||
|
||||
if not original_cdx and not is_revisit:
|
||||
|
@ -156,6 +156,14 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
|
||||
|
||||
# Resolve Revisit -- cdxj minimal -- output also json
|
||||
>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
|
||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
|
3
sample_archive/cdxj/example-no-digest.cdxj
Normal file
3
sample_archive/cdxj/example-no-digest.cdxj
Normal file
@ -0,0 +1,3 @@
|
||||
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
Loading…
x
Reference in New Issue
Block a user