From 9eba59d8b4b9eff91d8a411e3a1d46c1c43686ac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 30 Nov 2017 14:13:47 -0800 Subject: [PATCH] warcserver: resource load: only read headers for self-redirect for response or revisit records tests: add test with resource record (new warc/cdxj) to ensure correct read of resource records --- pywb/indexer/test/test_indexing.py | 8 +++---- pywb/warcserver/resource/responseloader.py | 20 ++++++++++-------- sample_archive/cdx/httpbin-resource.cdxj | 1 + sample_archive/warcs/httpbin-resource.warc.gz | Bin 0 -> 465 bytes tests/test_integration.py | 4 ++++ 5 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 sample_archive/cdx/httpbin-resource.cdxj create mode 100644 sample_archive/warcs/httpbin-resource.warc.gz diff --git a/pywb/indexer/test/test_indexing.py b/pywb/indexer/test/test_indexing.py index b2bd6dfe..de79f2e9 100644 --- a/pywb/indexer/test/test_indexing.py +++ b/pywb/indexer/test/test_indexing.py @@ -152,19 +152,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0', >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz -Total: 211 +Total: 212 # test sort, multiple inputs, recursive, from base test dir >>> cli_lines(['--sort', '-r', '-', get_test_dir()]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz -Total: 211 +Total: 212 # test sort, 9-field, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz -Total: 406 +Total: 407 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) @@ -188,7 +188,7 @@ Total: 4 >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz -Total: 211 +Total: 212 # test writing to temp dir, also use unicode filename >>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz') diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index d45d291f..6b7cccc9 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -195,15 +195,17 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader): failed_files, local_index_query)) - status = cdx.get('status') - if not status or status.startswith('3'): - http_headers = self.headers_parser.parse(payload.raw_stream) - self.raise_on_self_redirect(params, cdx, - http_headers.get_statuscode(), - http_headers.get_header('Location')) - http_headers_buff = http_headers.to_bytes() - else: - http_headers_buff = None + http_headers_buff = None + if payload.rec_type in ('response', 'revisit'): + status = cdx.get('status') + # status may not be set for 'revisit' + if not status or status.startswith('3'): + http_headers = self.headers_parser.parse(payload.raw_stream) + self.raise_on_self_redirect(params, cdx, + http_headers.get_statuscode(), + http_headers.get_header('Location')) + + http_headers_buff = http_headers.to_bytes() warc_headers = payload.rec_headers diff --git a/sample_archive/cdx/httpbin-resource.cdxj b/sample_archive/cdx/httpbin-resource.cdxj new file mode 100644 index 00000000..6d01f921 --- /dev/null +++ b/sample_archive/cdx/httpbin-resource.cdxj @@ -0,0 +1 @@ +org,httpbin)/anything/resource.json 20171130220904 {"filename":"httpbin-resource.warc.gz","digest":"UQ3W6RIQVJO6ZEL55355BJODG2DMWBPH","length":"465","offset":"0","mime":"application/json","url":"http://httpbin.org/anything/resource.json"} diff --git a/sample_archive/warcs/httpbin-resource.warc.gz b/sample_archive/warcs/httpbin-resource.warc.gz new file mode 100644 index 0000000000000000000000000000000000000000..af3df8f332f21e8104e2ad5ffbceba23135ac1bc GIT binary patch literal 465 zcmV;?0WSU@iwFP!000021C^3pYuhjshVKpe55`wpnPtm~KO)8;U$iM5ZL?&7-YL?l zt<9B!EHgssf1i}tAvcBg&gZ=EdG+YX+sSGcVH6L%t^Eq;)By3580R6z;V@n&35mxf z9zAu2U#is%!p&+){AW=Ei6RtL#_V@OBBjfS%7dwdEF%rg(Tn~xN_NvT4fX*3-Z4qs zAG1E}t)Y+Tnk!&R^KdyQ{`Xc((zb#V%!@qDDTF-3JDUc1xZ5Qxn9bic0!qedthov3>K4rFO6;t#WY;0?T2&-Vj+&n6)L>)SWq zm7E%C%OE(F?%gisXQf{*$_gk4eKgq8HYSY0-nf9P=<1Yl?uHA=6t_!rvfvg-n8Vp) z86_|W?POM~2F}`zYL1x~FL2g24d`%E+BE0b+DEhhC>|s1f8kXB6B1`AL-E(Z+B<5u z6l<1kU7s2+U@FDk{Pkkp-CSu=iX%&wBU}_{SfFGy>eN=(J