From 5028901a1710487818e393632d0c4f0b299f56a5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 20 Apr 2015 08:58:51 -0700 Subject: [PATCH] tests: add tests for indexing http custom status/verbs with and without verify #99 --- pywb/warc/test/test_indexing.py | 22 ++++- sample_archive/warcs/example-extra.warc | 107 ++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 4 deletions(-) diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 864270c0..0d196b17 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -134,6 +134,20 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz +# Test with custom verbs/protocol +#================================================================ +# no validation +>>> print_cdx_index('example-extra.warc') + CDX N b a m s k r M S V g +com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc +com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc +com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 3207 example-extra.warc +com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 5910 example-extra.warc + +>>> print_cdx_index('example-extra.warc', verify_http=True) +Traceback (most recent call last): +StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', 'HTTP/1.1'] - Found: HTTPX/1.1 200 OK + # Test CLI interface -- (check for num lines) #================================================================= @@ -142,19 +156,19 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz -Total: 208 +Total: 210 # test sort, multiple inputs, recursive, from base test dir >>> cli_lines(['--sort', '-r', '-', get_test_dir()]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz -Total: 208 +Total: 210 # test sort, 9-field, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz -Total: 401 +Total: 404 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) @@ -178,7 +192,7 @@ Total: 4 >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz -Total: 208 +Total: 210 # test writing to temp dir, also use unicode filename >>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz')) diff --git a/sample_archive/warcs/example-extra.warc b/sample_archive/warcs/example-extra.warc index 8839ddd1..2abd9d30 100644 --- a/sample_archive/warcs/example-extra.warc +++ b/sample_archive/warcs/example-extra.warc @@ -105,3 +105,110 @@ WARC-Refers-To-Date: 2014-01-03T03:03:21Z Content-Length: 0 +WARC/1.0 +WARC-Type: response +WARC-Record-ID: +WARC-Date: 2014-01-03T03:03:21Z +Content-Length: 1610 +Content-Type: application/http; msgtype=response +WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: + +HTTPX/1.1 200 OK +Accept-Ranges: bytes +Cache-Control: max-age=604800 +Content-Type: text/html +Date: Fri, 03 Jan 2014 03:03:21 GMT +Etag: "359670651" +Expires: Fri, 10 Jan 2014 03:03:21 GMT +Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT +Server: ECS (sjc/4FCE) +X-Cache: HIT +x-ec-custom-error: 1 +Content-Length: 1270 +Connection: close + + + + + Example Domain + + + + + + + + +
+

Example Domain

+

This domain is established to be used for illustrative examples in documents. You may use this + domain in examples without prior coordination or asking for permission.

+

More information...

+
+ + + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Date: 2014-01-03T03:03:21Z +Content-Length: 323 +Content-Type: application/http; msgtype=request +WARC-Concurrent-To: +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: + +GETX /?example=2 HTTP/1.1 +Connection: close +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +Accept-Language: en-US,en;q=0.8 +User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page) +Host: example.com + + +WARC/1.0 +WARC-Type: revisit +WARC-Record-ID: +WARC-Date: 2014-06-03T03:03:41Z +WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: +WARC-Profile: http://netpreserve.org/warc/0.18/revisit/identical-payload-digest +WARC-Refers-To-Target-URI: http://example.com?example=2 +WARC-Refers-To-Date: 2014-01-03T03:03:21Z +Content-Length: 0 + +