diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index ca4ad3fb..9cb80197 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -291,12 +291,16 @@ def cdx_resolve_revisits(cdx_iter): for cdx in cdx_iter: is_revisit = cdx.is_revisit() - digest = cdx[DIGEST] + digest = cdx.get(DIGEST) - original_cdx = originals.get(digest) + original_cdx = None - if not original_cdx and not is_revisit: - originals[digest] = cdx + # only set if digest is valid, otherwise no way to resolve + if digest: + original_cdx = originals.get(digest) + + if not original_cdx and not is_revisit: + originals[digest] = cdx if original_cdx and is_revisit: fill_orig = lambda field: original_cdx[field] diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py index 18ec4ffa..42310d64 100644 --- a/pywb/cdx/test/test_cdxops.py +++ b/pywb/cdx/test/test_cdxops.py @@ -156,6 +156,14 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex {"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} {"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"} +# Resolve Revisit -- cdxj minimal -- output also json +>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True) +{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} +{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"} + + + + """ #================================================================= diff --git a/sample_archive/cdxj/example-no-digest.cdxj b/sample_archive/cdxj/example-no-digest.cdxj new file mode 100644 index 00000000..2ffd30f7 --- /dev/null +++ b/sample_archive/cdxj/example-no-digest.cdxj @@ -0,0 +1,3 @@ +com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"} +com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "length": "553", "offset": "1864", "filename": "example.warc.gz"} +org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}