From 913a1e9f31f66964e6d8dfe947e52d2b81db1250 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 25 Jun 2014 12:11:26 -0700 Subject: [PATCH] warc: simplify recordloader a bit more, only response and request records get parsed as http (excluding dns: and whois: uris) All others have an '-' status and no headers parsing tests: add test for zero-length revisits --- pywb/cdx/test/test_cdxops.py | 4 +- pywb/warc/recordloader.py | 52 +++++------- pywb/warc/test/test_indexing.py | 10 +-- sample_archive/cdx/example-extra.cdx | 3 + sample_archive/warcs/example-extra.warc | 107 ++++++++++++++++++++++++ tests/test_integration.py | 8 ++ 6 files changed, 145 insertions(+), 39 deletions(-) create mode 100644 sample_archive/cdx/example-extra.cdx create mode 100644 sample_archive/warcs/example-extra.warc diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py index edfcd749..81ab4660 100644 --- a/pywb/cdx/test/test_cdxops.py +++ b/pywb/cdx/test/test_cdxops.py @@ -54,7 +54,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz # Filter exact invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = ['!=urlkey:com,example)/?example=1', '!=urlkey:com,example)/?example=2']) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz @@ -65,7 +65,7 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz # Filter contains invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=') com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index ed904ac7..dee29020 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -46,6 +46,8 @@ class ArcWarcRecordLoader: HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'OPTIONS', 'CONNECT', 'PATCH'] + NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource') + def __init__(self, loader=None, cookie_maker=None, block_size=8192): if not loader: loader = BlockLoader(cookie_maker) @@ -94,25 +96,22 @@ class ArcWarcRecordLoader: known_format)) if the_format == 'arc': - rec_type = 'response' uri = rec_headers.get_header('uri') length = rec_headers.get_header('length') + content_type = rec_headers.get_header('content-type') sub_len = rec_headers.total_len + if uri and uri.startswith('filedesc://'): + rec_type = 'arc_header' + else: + rec_type = 'response' elif the_format == 'warc': rec_type = rec_headers.get_header('WARC-Type') uri = rec_headers.get_header('WARC-Target-URI') length = rec_headers.get_header('Content-Length') + content_type = rec_headers.get_header('Content-Type') sub_len = 0 - if rec_type == 'response' and uri: - if uri.startswith('filedesc://'): - rec_type = 'arc_header' - elif uri.startswith('dns:'): - rec_type = 'dns_response' - elif uri.startswith('whois:'): - rec_type = 'whois_response' - is_err = False try: @@ -124,39 +123,28 @@ class ArcWarcRecordLoader: # err condition if is_err: - status_headers = StatusAndHeaders('-', []) length = 0 - # special case: empty w/arc record (hopefully a revisit) - elif length == 0: - status_headers = StatusAndHeaders('204 No Content', []) - # limit stream to the length for all valid records stream = LimitReader.wrap_stream(stream, length) + # if empty record (error or otherwise) set status to - if length == 0: - # already handled error case above - pass + status_headers = StatusAndHeaders('- None', []) - # ================================================================ - # handle different types of records - # special case: warc records that are not expected to have http headers - # attempt to add 200 status and content-type - elif rec_type == 'metadata' or rec_type == 'resource': - content_type = [('Content-Type', - rec_headers.get_header('Content-Type'))] + # response record or non-empty revisit: parse HTTP status and headers! + elif (rec_type in ('response', 'revisit') and + not uri.startswith(('dns:', 'whois:'))): + status_headers = self.http_parser.parse(stream) - status_headers = StatusAndHeaders('200 OK', content_type) - - elif (rec_type in ('warcinfo', 'arc_header', 'dns_response', 'whois_response')): - # no extra parsing of body for these - status_headers = StatusAndHeaders('204 No Content', []) - - elif (rec_type == 'request'): + # request record: parse request + elif ((rec_type == 'request') and + not uri.startswith(('dns:', 'whois:'))): status_headers = self.http_req_parser.parse(stream) - # response record: parse HTTP status and headers! + # everything else: create a no-status entry, set content-type else: - status_headers = self.http_parser.parse(stream) + content_type_header = [('Content-Type', content_type)] + status_headers = StatusAndHeaders('- OK', content_type_header) return ArcWarcRecord(the_format, rec_type, rec_headers, stream, status_headers) diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 172d923e..e9c7a892 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -47,9 +47,9 @@ com,example)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ >>> print_cdx_index('example-wget-1-14.warc.gz') CDX N b a m s k r M S V g com,example)/ 20140216012908 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1151 792 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain 200 SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain 200 UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz -metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain 200 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/manifest.txt 20140216012908 metadata://gnu.org/software/wget/warc/MANIFEST.txt text/plain - SWUF4CK2XMZSOKSA7SDT7M7NUGWH2TRE - - 315 1943 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/wget_arguments.txt 20140216012908 metadata://gnu.org/software/wget/warc/wget_arguments.txt text/plain - UCXDCGORD6K4RJT5NUQGKE2PKEG4ZZD6 - - 340 2258 example-wget-1-14.warc.gz +metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/wget/warc/wget.log text/plain - 2ULE2LD5UOWDXGACCT624TU5BVKACRQ4 - - 599 2598 example-wget-1-14.warc.gz # bad arcs -- test error edge cases >>> print_cdx_index('bad.arc') @@ -104,13 +104,13 @@ org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -204 +206 # test sort, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz -395 +398 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) diff --git a/sample_archive/cdx/example-extra.cdx b/sample_archive/cdx/example-extra.cdx new file mode 100644 index 00000000..bdc13f92 --- /dev/null +++ b/sample_archive/cdx/example-extra.cdx @@ -0,0 +1,3 @@ + CDX N b a m s k r M S V g +com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc +com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc diff --git a/sample_archive/warcs/example-extra.warc b/sample_archive/warcs/example-extra.warc new file mode 100644 index 00000000..8839ddd1 --- /dev/null +++ b/sample_archive/warcs/example-extra.warc @@ -0,0 +1,107 @@ +WARC/1.0 +WARC-Type: response +WARC-Record-ID: +WARC-Date: 2014-01-03T03:03:21Z +Content-Length: 1610 +Content-Type: application/http; msgtype=response +WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: + +HTTP/1.1 200 OK +Accept-Ranges: bytes +Cache-Control: max-age=604800 +Content-Type: text/html +Date: Fri, 03 Jan 2014 03:03:21 GMT +Etag: "359670651" +Expires: Fri, 10 Jan 2014 03:03:21 GMT +Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT +Server: ECS (sjc/4FCE) +X-Cache: HIT +x-ec-custom-error: 1 +Content-Length: 1270 +Connection: close + + + + + Example Domain + + + + + + + + +
+

Example Domain

+

This domain is established to be used for illustrative examples in documents. You may use this + domain in examples without prior coordination or asking for permission.

+

More information...

+
+ + + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Date: 2014-01-03T03:03:21Z +Content-Length: 323 +Content-Type: application/http; msgtype=request +WARC-Concurrent-To: +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: + +GET /?example=2 HTTP/1.1 +Connection: close +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +Accept-Language: en-US,en;q=0.8 +User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 (via Wayback Save Page) +Host: example.com + + +WARC/1.0 +WARC-Type: revisit +WARC-Record-ID: +WARC-Date: 2014-06-03T03:03:41Z +WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A +WARC-Target-URI: http://example.com?example=2 +WARC-Warcinfo-ID: +WARC-Profile: http://netpreserve.org/warc/0.18/revisit/identical-payload-digest +WARC-Refers-To-Target-URI: http://example.com?example=2 +WARC-Refers-To-Date: 2014-01-03T03:03:21Z +Content-Length: 0 + + diff --git a/tests/test_integration.py b/tests/test_integration.py index 0541b08a..107ae2bb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -115,6 +115,14 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb-nosurt/20140103030321mp_/http://www.iana.org/domains/example' in resp.body + def test_zero_len_revisit(self): + resp = self.testapp.get('/pywb/20140603030341mp_/http://example.com?example=2') + self._assert_basic_html(resp) + + assert 'Tue, Jun 03 2014 03:03:41' in resp.body + assert 'wb.js' in resp.body + assert '/pywb/20140603030341mp_/http://www.iana.org/domains/example' in resp.body + def test_replay_url_agnostic_revisit(self): resp = self.testapp.get('/pywb/20130729195151mp_/http://www.example.com/') self._assert_basic_html(resp)