mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: support indexing (and even replay of) records where target-uri is a 'urn:' identifier (#91)
For canonicalization, treat urns as-is, since they are already canonical. For wburl, don't add the http:// prefix if the urn: prefix is present. Add example-wpull warc for testing.
This commit is contained in:
parent
002fe6a338
commit
30ab27bb1c
@ -44,6 +44,10 @@ ur"""
|
||||
>>> repr(WbUrl('http://example.com?example=2'))
|
||||
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
|
||||
|
||||
# support urn: prefix
|
||||
>>> repr(WbUrl('urn:X-wpull:log'))
|
||||
"('latest_replay', '', '', 'urn:X-wpull:log', 'urn:X-wpull:log')"
|
||||
|
||||
# Test scheme partially encoded urls
|
||||
>>> repr(WbUrl('https%3A//example.com/'))
|
||||
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||
|
@ -178,6 +178,9 @@ class WbUrl(BaseWbUrl):
|
||||
|
||||
self.url = new_uri
|
||||
|
||||
if self.url.startswith('urn:'):
|
||||
return
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
inx = self.url.find(':/')
|
||||
|
@ -33,10 +33,17 @@ def canonicalize(url, surt_ordered=True):
|
||||
|
||||
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
|
||||
'example.com/path/file.html'
|
||||
|
||||
>>> canonicalize('urn:some:id')
|
||||
'urn:some:id'
|
||||
"""
|
||||
try:
|
||||
key = surt.surt(url)
|
||||
except Exception as e:
|
||||
# urn is already canonical, so just use as-is
|
||||
if url.startswith('urn:'):
|
||||
return url
|
||||
|
||||
raise UrlCanonicalizeException('Invalid Url: ' + url)
|
||||
|
||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||
|
@ -83,6 +83,12 @@ metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.
|
||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||
|
||||
# wpull warc, includes metadata by default
|
||||
>>> print_cdx_index('example-wpull.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
@ -135,20 +141,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
||||
# test sort, multiple inputs
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
Total: 206
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
Total: 208
|
||||
|
||||
# test sort, multiple inputs, recursive, from base test dir
|
||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz
|
||||
Total: 206
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||
Total: 208
|
||||
|
||||
# test sort, 9-field, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||
Total: 398
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||
Total: 401
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
@ -171,8 +177,8 @@ Total: 4
|
||||
# test custom root dir for cdx filenames, dir input
|
||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz
|
||||
Total: 206
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||
Total: 208
|
||||
|
||||
# test writing to temp dir, also use unicode filename
|
||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||
|
BIN
sample_archive/warcs/example-wpull.warc.gz
Normal file
BIN
sample_archive/warcs/example-wpull.warc.gz
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user