1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdxj: ensure revisit resolve is skipped if the digest is missing, as may be the case in cdxj (#85)

This commit is contained in:
Ilya Kreymer 2015-03-26 11:11:10 -07:00
parent 2dbde35d74
commit 85082e46bf
3 changed files with 19 additions and 4 deletions

View File

@ -291,12 +291,16 @@ def cdx_resolve_revisits(cdx_iter):
for cdx in cdx_iter:
is_revisit = cdx.is_revisit()
digest = cdx[DIGEST]
digest = cdx.get(DIGEST)
original_cdx = originals.get(digest)
original_cdx = None
if not original_cdx and not is_revisit:
originals[digest] = cdx
# only set if digest is valid, otherwise no way to resolve
if digest:
original_cdx = originals.get(digest)
if not original_cdx and not is_revisit:
originals[digest] = cdx
if original_cdx and is_revisit:
fill_orig = lambda field: original_cdx[field]

View File

@ -156,6 +156,14 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "filename": "example.warc.gz", "length": "553", "mime": "", "offset": "1864", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "orig.length": "1043", "orig.offset": "333", "orig.filename": "example.warc.gz"}
# Resolve Revisit -- cdxj minimal -- output also json
>>> cdx_ops_test(url = 'http://example.com/?example=1', sources=[get_test_dir() + 'cdxj/example-no-digest.cdxj'], resolveRevisits=True)
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030321", "url": "http://example.com?example=1", "length": "1043", "filename": "example.warc.gz", "offset": "333", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
{"urlkey": "com,example)/?example=1", "timestamp": "20140103030341", "url": "http://example.com?example=1", "length": "553", "filename": "example.warc.gz", "mime": "warc/revisit", "offset": "1864", "orig.length": "-", "orig.offset": "-", "orig.filename": "-"}
"""
#=================================================================

View File

@ -0,0 +1,3 @@
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "length": "553", "offset": "1864", "filename": "example.warc.gz"}
org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"}