1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warcserver: resource load: only parse the HTTP headers for the self-redirect check when the record type is 'response' or 'revisit'

tests: add a test with a 'resource' record (new WARC and CDXJ fixtures) to ensure that resource records are read correctly
This commit is contained in:
Ilya Kreymer 2017-11-30 14:13:47 -08:00
parent 8a107b0f6f
commit 9eba59d8b4
5 changed files with 20 additions and 13 deletions

View File

@ -152,19 +152,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 211
Total: 212
# test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 211
Total: 212
# test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 406
Total: 407
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@ -188,7 +188,7 @@ Total: 4
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 211
Total: 212
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')

View File

@ -195,15 +195,17 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
failed_files,
local_index_query))
status = cdx.get('status')
if not status or status.startswith('3'):
http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(),
http_headers.get_header('Location'))
http_headers_buff = http_headers.to_bytes()
else:
http_headers_buff = None
http_headers_buff = None
if payload.rec_type in ('response', 'revisit'):
status = cdx.get('status')
# status may not be set for 'revisit'
if not status or status.startswith('3'):
http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(),
http_headers.get_header('Location'))
http_headers_buff = http_headers.to_bytes()
warc_headers = payload.rec_headers

View File

@ -0,0 +1 @@
org,httpbin)/anything/resource.json 20171130220904 {"filename":"httpbin-resource.warc.gz","digest":"UQ3W6RIQVJO6ZEL55355BJODG2DMWBPH","length":"465","offset":"0","mime":"application/json","url":"http://httpbin.org/anything/resource.json"}

Binary file not shown.

View File

@ -101,6 +101,10 @@ class TestWbIntegration(BaseConfigTest):
csp = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
assert resp.headers['Content-Security-Policy'] == csp
# Replay test for the httpbin 'resource.json' capture: fetches it through the
# 'pywb' collection and checks the Content-Type header of the replayed response.
def test_replay_resource(self, fmod):
# '{0}' in the path is a placeholder; fmod is passed alongside it to self.get
# (presumably the replay modifier under test -- confirm against BaseConfigTest).
resp = self.get('/pywb/20171122230223{0}/http://httpbin.org/anything/resource.json', fmod)
# The replayed capture is expected to be served as JSON.
assert resp.headers['Content-Type'] == 'application/json'
def test_replay_fuzzy_1(self, fmod):
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?_=123', fmod)
assert resp.status_int == 200