mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdxj: ensure revisit resolve is skipped if the digest is missing, as may be the case in cdxj (#85)
This commit is contained in:
parent
2dbde35d74
commit
85082e46bf
@ -291,12 +291,16 @@ def cdx_resolve_revisits(cdx_iter):
|
|||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
is_revisit = cdx.is_revisit()
|
is_revisit = cdx.is_revisit()
|
||||||
|
|
||||||
digest = cdx[DIGEST]
|
digest = cdx.get(DIGEST)
|
||||||
|
|
||||||
original_cdx = originals.get(digest)
|
original_cdx = None
|
||||||
|
|
||||||
if not original_cdx and not is_revisit:
|
# only set if digest is valid, otherwise no way to resolve
|
||||||
originals[digest] = cdx
|
if digest:
|
||||||
|
original_cdx = originals.get(digest)
|
||||||
|
|
||||||
|
if not original_cdx and not is_revisit:
|
||||||
|
originals[digest] = cdx
|
||||||
|
|
||||||
if original_cdx and is_revisit:
|
if original_cdx and is_revisit:
|
||||||
fill_orig = lambda field: original_cdx[field]
|
fill_orig = lambda field: original_cdx[field]
|
||||||
|
@ -156,6 +156,14 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
|||||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||||
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
|
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
|
||||||
|
|
||||||
|
# Resolve Revisit -- cdxj minimal -- output also json
|
||||||
|
>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
|
||||||
|
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||||
|
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
3
sample_archive/cdxj/example-no-digest.cdxj
Normal file
3
sample_archive/cdxj/example-no-digest.cdxj
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
|
||||||
|
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
|
||||||
|
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}
|
Loading…
x
Reference in New Issue
Block a user