mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: support indexing (and even replay of) records where target-uri is a 'urn:' identifier (#91)
for canonicalization, treat urns as-is (already canonical); for wburl, don't add http:// prefix if urn: prefix is present; add example-wpull warc for testing
This commit is contained in:
parent
002fe6a338
commit
30ab27bb1c
@ -44,6 +44,10 @@ ur"""
|
|||||||
>>> repr(WbUrl('http://example.com?example=2'))
|
>>> repr(WbUrl('http://example.com?example=2'))
|
||||||
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
|
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
|
||||||
|
|
||||||
|
# support urn: prefix
|
||||||
|
>>> repr(WbUrl('urn:X-wpull:log'))
|
||||||
|
"('latest_replay', '', '', 'urn:X-wpull:log', 'urn:X-wpull:log')"
|
||||||
|
|
||||||
# Test scheme partially encoded urls
|
# Test scheme partially encoded urls
|
||||||
>>> repr(WbUrl('https%3A//example.com/'))
|
>>> repr(WbUrl('https%3A//example.com/'))
|
||||||
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||||
|
@ -178,6 +178,9 @@ class WbUrl(BaseWbUrl):
|
|||||||
|
|
||||||
self.url = new_uri
|
self.url = new_uri
|
||||||
|
|
||||||
|
if self.url.startswith('urn:'):
|
||||||
|
return
|
||||||
|
|
||||||
# protocol agnostic url -> http://
|
# protocol agnostic url -> http://
|
||||||
# no protocol -> http://
|
# no protocol -> http://
|
||||||
inx = self.url.find(':/')
|
inx = self.url.find(':/')
|
||||||
|
@ -33,10 +33,17 @@ def canonicalize(url, surt_ordered=True):
|
|||||||
|
|
||||||
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
|
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
|
||||||
'example.com/path/file.html'
|
'example.com/path/file.html'
|
||||||
|
|
||||||
|
>>> canonicalize('urn:some:id')
|
||||||
|
'urn:some:id'
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
key = surt.surt(url)
|
key = surt.surt(url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# urn is already canonical, so just use as-is
|
||||||
|
if url.startswith('urn:'):
|
||||||
|
return url
|
||||||
|
|
||||||
raise UrlCanonicalizeException('Invalid Url: ' + url)
|
raise UrlCanonicalizeException('Invalid Url: ' + url)
|
||||||
|
|
||||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||||
|
@ -83,6 +83,12 @@ metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.
|
|||||||
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz
|
||||||
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz
|
||||||
|
|
||||||
|
# wpull warc, includes metadata by default
|
||||||
|
>>> print_cdx_index('example-wpull.warc.gz')
|
||||||
|
CDX N b a m s k r M S V g
|
||||||
|
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
||||||
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||||
|
|
||||||
# bad arcs -- test error edge cases
|
# bad arcs -- test error edge cases
|
||||||
>>> print_cdx_index('bad.arc', include_all=True)
|
>>> print_cdx_index('bad.arc', include_all=True)
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
@ -135,20 +141,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar
|
|||||||
# test sort, multiple inputs
|
# test sort, multiple inputs
|
||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||||
Total: 206
|
Total: 208
|
||||||
|
|
||||||
# test sort, multiple inputs, recursive, from base test dir
|
# test sort, multiple inputs, recursive, from base test dir
|
||||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||||
Total: 206
|
Total: 208
|
||||||
|
|
||||||
# test sort, 9-field, multiple inputs, all records + post query
|
# test sort, 9-field, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||||
Total: 398
|
Total: 401
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
@ -171,8 +177,8 @@ Total: 4
|
|||||||
# test custom root dir for cdx filenames, dir input
|
# test custom root dir for cdx filenames, dir input
|
||||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||||
Total: 206
|
Total: 208
|
||||||
|
|
||||||
# test writing to temp dir, also use unicode filename
|
# test writing to temp dir, also use unicode filename
|
||||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||||
|
BIN
sample_archive/warcs/example-wpull.warc.gz
Normal file
BIN
sample_archive/warcs/example-wpull.warc.gz
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user