diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index feabc3f1..88b57f75 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -44,6 +44,10 @@ ur""" >>> repr(WbUrl('http://example.com?example=2')) "('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')" +# support urn: prefix +>>> repr(WbUrl('urn:X-wpull:log')) +"('latest_replay', '', '', 'urn:X-wpull:log', 'urn:X-wpull:log')" + # Test scheme partially encoded urls >>> repr(WbUrl('https%3A//example.com/')) "('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 5efe9e45..57967d37 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -178,6 +178,9 @@ class WbUrl(BaseWbUrl): self.url = new_uri + if self.url.startswith('urn:'): + return + # protocol agnostic url -> http:// # no protocol -> http:// inx = self.url.find(':/') diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index f8630284..7bbbf7ed 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -33,10 +33,17 @@ def canonicalize(url, surt_ordered=True): >>> canonicalize('http://example.com/path/file.html', surt_ordered=False) 'example.com/path/file.html' + + >>> canonicalize('urn:some:id') + 'urn:some:id' """ try: key = surt.surt(url) except Exception as e: + # urn is already canonical, so just use as-is + if url.startswith('urn:'): + return url + raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index de64e513..864270c0 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -83,6 +83,12 @@ metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu. metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz +# wpull warc, includes metadata by default +>>> print_cdx_index('example-wpull.warc.gz') + CDX N b a m s k r M S V g +com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz + # bad arcs -- test error edge cases >>> print_cdx_index('bad.arc', include_all=True) CDX N b a m s k r M S V g @@ -135,20 +141,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar # test sort, multiple inputs >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz +Total: 208 # test sort, multiple inputs, recursive, from base test dir >>> cli_lines(['--sort', '-r', '-', get_test_dir()]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz +Total: 208 # test sort, 9-field, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz -Total: 398 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz +Total: 401 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) @@ -171,8 +177,8 @@ Total: 4 # test custom root dir for cdx filenames, dir input >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz -org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz -Total: 206 +urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz +Total: 208 # test writing to temp dir, also use unicode filename >>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz')) diff --git a/sample_archive/warcs/example-wpull.warc.gz b/sample_archive/warcs/example-wpull.warc.gz new file mode 100644 index 00000000..227eb019 Binary files /dev/null and b/sample_archive/warcs/example-wpull.warc.gz differ