1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

indexing: support indexing (and even replay of) records where target-uri is a 'urn:' identifier (#91)

for canonicalization, treat urns as-is, since they are already canonical
for wburl, don't add http:// prefix if urn: prefix is present
add example-wpull warc for testing
This commit is contained in:
Ilya Kreymer 2015-03-30 17:21:17 -07:00
parent 002fe6a338
commit 30ab27bb1c
5 changed files with 28 additions and 8 deletions

View File

@ -44,6 +44,10 @@ ur"""
>>> repr(WbUrl('http://example.com?example=2'))
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
# support urn: prefix
>>> repr(WbUrl('urn:X-wpull:log'))
"('latest_replay', '', '', 'urn:X-wpull:log', 'urn:X-wpull:log')"
# Test scheme partially encoded urls
>>> repr(WbUrl('https%3A//example.com/'))
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"

View File

@ -178,6 +178,9 @@ class WbUrl(BaseWbUrl):
self.url = new_uri
if self.url.startswith('urn:'):
return
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')

View File

@ -33,10 +33,17 @@ def canonicalize(url, surt_ordered=True):
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
'example.com/path/file.html'
>>> canonicalize('urn:some:id')
'urn:some:id'
"""
try:
key = surt.surt(url)
except Exception as e:
# urn is already canonical, so just use as-is
if url.startswith('urn:'):
return url
raise UrlCanonicalizeException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url

View File

@ -83,6 +83,12 @@ metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
# wpull warc, includes metadata by default
>>> print_cdx_index('example-wpull.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc', include_all=True)
CDX N b a m s k r M S V g
@ -135,20 +141,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
# test sort, multiple inputs
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
Total: 206
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 208
# test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz
Total: 206
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 208
# test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
Total: 398
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 401
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@ -171,8 +177,8 @@ Total: 4
# test custom root dir for cdx filenames, dir input
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz
Total: 206
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 208
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))

Binary file not shown.