mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
record reading: better handling of empty arc (or warc) records
For indexing, index empty/invalid length as '-' status code; for reading, serve as 204 No Content. Ensure that StatusAndHeaders has a valid statusline when serving. If the HTTP content-length is valid, limit the stream to that content-length as well as the record content-length (whichever is smaller). Replace content-length when buffering.
This commit is contained in:
parent
d8c20a59cf
commit
64eef7063d
@ -93,7 +93,10 @@ class BlockLoader(object):
|
||||
headers['Range'] = range_header
|
||||
|
||||
if self.cookie_maker:
|
||||
headers['Cookie'] = self.cookie_maker.make()
|
||||
if isinstance(self.cookie_maker, basestring):
|
||||
headers['Cookie'] = self.cookie_maker
|
||||
else:
|
||||
headers['Cookie'] = self.cookie_maker.make()
|
||||
|
||||
request = urllib2.Request(url, headers=headers)
|
||||
return urllib2.urlopen(request)
|
||||
@ -184,7 +187,12 @@ class LimitReader(object):
|
||||
try:
|
||||
content_length = int(content_length)
|
||||
if content_length >= 0:
|
||||
stream = LimitReader(stream, content_length)
|
||||
# optimize: if already a LimitStream, set limit to
|
||||
# the smaller of the two limits
|
||||
if isinstance(stream, LimitReader):
|
||||
stream.limit = min(stream.limit, content_length)
|
||||
else:
|
||||
stream = LimitReader(stream, content_length)
|
||||
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
@ -29,6 +29,21 @@ class StatusAndHeaders(object):
|
||||
if value[0].lower() == name_lower:
|
||||
return value[1]
|
||||
|
||||
def replace_header(self, name, value):
    """
    Replace an existing header with a new value, or add it as a new header.

    The header name is matched case-insensitively. When several headers
    share the name, only the last one in the list is replaced, and the
    original capitalisation of its name is kept.

    :param name: header name to replace or add
    :param value: new header value
    :return: the previous value of the replaced header, or None if the
             header did not exist and was appended instead
    """
    name_lower = name.lower()

    # Walk the list back-to-front so the last matching header wins.
    # reversed(range(...)) works on both Python 2 and Python 3, unlike
    # the Python 2-only xrange().
    for index in reversed(range(len(self.headers))):
        curr_name, curr_value = self.headers[index]
        if curr_name.lower() == name_lower:
            self.headers[index] = (curr_name, value)
            return curr_value

    # no existing header with this name: add a new one
    self.headers.append((name, value))
    return None
|
||||
|
||||
def remove_header(self, name):
|
||||
"""
|
||||
remove header (case-insensitive)
|
||||
@ -42,6 +57,20 @@ class StatusAndHeaders(object):
|
||||
|
||||
return False
|
||||
|
||||
def validate_statusline(self, valid_statusline):
    """
    Check that the statusline is valid, eg. starts with a positive
    numeric code. If not, replace it with the passed in valid_statusline.

    :param valid_statusline: statusline to use as a replacement when the
                             current one is invalid, eg. '204 No Content'
    :return: True if the current statusline was already valid, False if
             it was invalid and has been replaced
    """
    code = self.statusline.split(' ', 1)[0]

    try:
        # int() raises ValueError for an empty or non-numeric code
        if int(code) > 0:
            return True
    except ValueError:
        pass

    # Reached for a non-numeric code and for a numeric code <= 0.
    # An explicit comparison is used instead of assert, because the old
    # Python 2 form `except ValueError, AssertionError:` only caught
    # ValueError, and assert statements are removed under -O.
    self.statusline = valid_statusline
    return False
|
||||
|
||||
def __repr__(self):
|
||||
headers_str = pprint.pformat(self.headers, indent=2)
|
||||
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
|
||||
@ -81,9 +110,16 @@ class StatusAndHeadersParser(object):
|
||||
|
||||
statusline, total_read = _strip_count(full_statusline, 0)
|
||||
|
||||
headers = []
|
||||
|
||||
# at end of stream
|
||||
if total_read == 0:
|
||||
raise EOFError()
|
||||
elif not statusline:
|
||||
return StatusAndHeaders(statusline=statusline,
|
||||
headers=headers,
|
||||
protocol='',
|
||||
total_len=total_read)
|
||||
|
||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||
|
||||
@ -92,13 +128,15 @@ class StatusAndHeadersParser(object):
|
||||
msg = msg.format(self.statuslist, statusline)
|
||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
||||
|
||||
headers = []
|
||||
|
||||
line, total_read = _strip_count(stream.readline(), total_read)
|
||||
while line:
|
||||
name, value = line.split(':', 1)
|
||||
name = name.rstrip(' \t')
|
||||
value = value.lstrip()
|
||||
result = line.split(':', 1)
|
||||
if len(result) == 2:
|
||||
name = result[0].rstrip(' \t')
|
||||
value = result[1].lstrip()
|
||||
else:
|
||||
name = result[0]
|
||||
value = None
|
||||
|
||||
next_line, total_read = _strip_count(stream.readline(),
|
||||
total_read)
|
||||
@ -109,8 +147,10 @@ class StatusAndHeadersParser(object):
|
||||
next_line, total_read = _strip_count(stream.readline(),
|
||||
total_read)
|
||||
|
||||
header = (name, value)
|
||||
headers.append(header)
|
||||
if value is not None:
|
||||
header = (name, value)
|
||||
headers.append(header)
|
||||
|
||||
line = next_line
|
||||
|
||||
return StatusAndHeaders(statusline=protocol_status[1].strip(),
|
||||
|
@ -13,6 +13,14 @@ StatusAndHeadersParserException: Expected Status Line starting with ['Other'] -
|
||||
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
|
||||
True
|
||||
|
||||
# replace header, print new headers
|
||||
>>> st1.replace_header('some', 'Another-Value'); st1
|
||||
'Value'
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
||||
('Some', 'Another-Value'),
|
||||
('Multi-Line', 'Value1 Also This')])
|
||||
|
||||
|
||||
# remove header
|
||||
>>> st1.remove_header('some')
|
||||
True
|
||||
@ -20,6 +28,10 @@ True
|
||||
# already removed
|
||||
>>> st1.remove_header('Some')
|
||||
False
|
||||
|
||||
# empty
|
||||
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
|
||||
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
|
||||
"""
|
||||
|
||||
|
||||
@ -37,6 +49,11 @@ Multi-Line: Value1\r\n\
|
||||
Body"
|
||||
|
||||
|
||||
status_headers_2 = """
|
||||
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -164,7 +164,7 @@ class ArchiveIndexer(object):
|
||||
|
||||
digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
||||
|
||||
status = record.status_headers.statusline.split(' ')[0]
|
||||
status = self._extract_status(record.status_headers)
|
||||
|
||||
if record.rec_type == 'revisit':
|
||||
mime = 'warc/revisit'
|
||||
@ -205,7 +205,9 @@ class ArchiveIndexer(object):
|
||||
timestamp = record.rec_headers.get_header('archive-date')
|
||||
if len(timestamp) > 14:
|
||||
timestamp = timestamp[:14]
|
||||
status = record.status_headers.statusline.split(' ')[0]
|
||||
|
||||
status = self._extract_status(record.status_headers)
|
||||
|
||||
mime = record.rec_headers.get_header('content-type')
|
||||
mime = self._extract_mime(mime)
|
||||
|
||||
@ -228,6 +230,12 @@ class ArchiveIndexer(object):
|
||||
mime = 'unk'
|
||||
return mime
|
||||
|
||||
def _extract_status(self, status_headers):
|
||||
status = status_headers.statusline.split(' ')[0]
|
||||
if not status:
|
||||
status = '-'
|
||||
return status
|
||||
|
||||
def read_rest(self, reader, digester=None):
|
||||
""" Read remainder of the stream
|
||||
If a digester is included, update it
|
||||
|
@ -97,18 +97,24 @@ class ArcWarcRecordLoader:
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
length = rec_headers.get_header('Content-Length')
|
||||
|
||||
is_err = False
|
||||
|
||||
try:
|
||||
length = int(length)
|
||||
if length < 0:
|
||||
length = 0
|
||||
is_err = True
|
||||
except ValueError:
|
||||
length = 0
|
||||
is_err = True
|
||||
|
||||
# ================================================================
|
||||
# handle different types of records
|
||||
|
||||
# err condition
|
||||
if is_err:
|
||||
status_headers = StatusAndHeaders('-', [])
|
||||
length = 0
|
||||
# special case: empty w/arc record (hopefully a revisit)
|
||||
if length == 0:
|
||||
elif length == 0:
|
||||
status_headers = StatusAndHeaders('204 No Content', [])
|
||||
|
||||
# special case: warc records that are not expected to have http headers
|
||||
|
@ -63,6 +63,9 @@ class ResolvingLoader:
|
||||
if not headers_record or not payload_record:
|
||||
raise ArchiveLoadFailed('Could not load ' + str(cdx))
|
||||
|
||||
# ensure status line is valid from here
|
||||
headers_record.status_headers.validate_statusline('204 No Content')
|
||||
|
||||
return (headers_record.status_headers, payload_record.stream)
|
||||
|
||||
def _resolve_path_load(self, cdx, is_original, failed_files):
|
||||
|
@ -36,8 +36,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
||||
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc
|
||||
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
|
||||
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
|
||||
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
|
||||
|
||||
# Test CLI interface -- (check for num lines)
|
||||
#=================================================================
|
||||
@ -46,7 +47,7 @@ com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
|
||||
200
|
||||
201
|
||||
|
||||
# test writing to stdout
|
||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||
|
@ -1,9 +1,9 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.wbexception import WbException, NotFoundException
|
||||
from pywb.utils.loaders import LimitReader
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
@ -105,6 +105,9 @@ class ReplayView(object):
|
||||
if redir_response:
|
||||
return redir_response
|
||||
|
||||
length = status_headers.get_header('content-length')
|
||||
stream = LimitReader.wrap_stream(stream, length)
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest)
|
||||
|
||||
@ -124,9 +127,6 @@ class ReplayView(object):
|
||||
|
||||
# buffer response if buffering enabled
|
||||
if self.buffer_response:
|
||||
if wbrequest.is_identity:
|
||||
status_headers.remove_header('content-length')
|
||||
|
||||
response_iter = self.buffered_response(status_headers,
|
||||
response_iter)
|
||||
|
||||
@ -165,8 +165,10 @@ class ReplayView(object):
|
||||
content = out.getvalue()
|
||||
|
||||
content_length_str = str(len(content))
|
||||
status_headers.headers.append(('Content-Length',
|
||||
content_length_str))
|
||||
|
||||
# remove existing content length
|
||||
status_headers.replace_header('Content-Length',
|
||||
content_length_str)
|
||||
out.close()
|
||||
|
||||
return content
|
||||
|
Loading…
x
Reference in New Issue
Block a user