Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)
record reading: better handling of empty arc (or warc) records
For indexing, index an empty or invalid Content-Length as a '-' status code; for reading, serve such records as 204 No Content. Ensure that StatusAndHeaders has a valid statusline when serving. If the HTTP content-length is valid, limit the stream to the smaller of that content-length and the record content-length. Replace the content-length header when buffering.
This commit is contained in:
parent d8c20a59cf
commit 64eef7063d
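In rough terms the change is two-sided: the indexer writes a '-' status for records whose Content-Length is missing, non-numeric, or negative, while the replay path synthesizes a '204 No Content' status line so clients still receive a well-formed response. A condensed sketch of that split, using hypothetical helper names rather than actual pywb entry points:

    # indexing side: an empty statusline becomes '-' in the CDX status column
    def cdx_status(statusline):
        return statusline.split(' ')[0] or '-'

    # serving side: an invalid statusline is replaced before the response goes out
    # (validate_statusline is the new StatusAndHeaders method added later in this diff)
    def serve_status(status_headers):
        status_headers.validate_statusline('204 No Content')
        return status_headers.statusline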
@@ -93,7 +93,10 @@ class BlockLoader(object):
             headers['Range'] = range_header
 
         if self.cookie_maker:
-            headers['Cookie'] = self.cookie_maker.make()
+            if isinstance(self.cookie_maker, basestring):
+                headers['Cookie'] = self.cookie_maker
+            else:
+                headers['Cookie'] = self.cookie_maker.make()
 
         request = urllib2.Request(url, headers=headers)
         return urllib2.urlopen(request)
@@ -184,7 +187,12 @@ class LimitReader(object):
         try:
             content_length = int(content_length)
             if content_length >= 0:
-                stream = LimitReader(stream, content_length)
+                # optimize: if already a LimitStream, set limit to
+                # the smaller of the two limits
+                if isinstance(stream, LimitReader):
+                    stream.limit = min(stream.limit, content_length)
+                else:
+                    stream = LimitReader(stream, content_length)
 
         except (ValueError, TypeError):
             pass
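A minimal doctest-style sketch of the intended effect of this hunk, assuming wrap_stream returns the (possibly re-wrapped) stream and that LimitReader keeps its remaining byte budget in a limit attribute, as the code above suggests:

>>> from io import BytesIO
>>> from pywb.utils.loaders import LimitReader
>>> stream = LimitReader(BytesIO('0123456789'), 8)   # record content-length
>>> stream = LimitReader.wrap_stream(stream, '4')    # http content-length header
>>> stream.limit                                     # the smaller of the two limits wins
4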
@@ -29,6 +29,21 @@ class StatusAndHeaders(object):
             if value[0].lower() == name_lower:
                 return value[1]
 
+    def replace_header(self, name, value):
+        """
+        replace header with new value or add new header
+        return old header value, if any
+        """
+        name_lower = name.lower()
+        for index in xrange(len(self.headers) - 1, -1, -1):
+            curr_name, curr_value = self.headers[index]
+            if curr_name.lower() == name_lower:
+                self.headers[index] = (curr_name, value)
+                return curr_value
+
+        self.headers.append((name, value))
+        return None
+
     def remove_header(self, name):
         """
         remove header (case-insensitive)
@@ -42,6 +57,20 @@ class StatusAndHeaders(object):
 
         return False
 
+    def validate_statusline(self, valid_statusline):
+        """
+        Check that the statusline is valid, eg. starts with a numeric
+        code. If not, replace with passed in valid_statusline
+        """
+        code = self.statusline.split(' ', 1)[0]
+        try:
+            code = int(code)
+            assert(code > 0)
+            return True
+        except (ValueError, AssertionError):
+            self.statusline = valid_statusline
+            return False
+
     def __repr__(self):
         headers_str = pprint.pformat(self.headers, indent=2)
         return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
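A short usage sketch for the two helpers added above, in the doctest style of the test module touched later in this commit; the header reprs assume plain tuples in the headers list, as in the class itself:

>>> sh = StatusAndHeaders('invalid', [('Some', 'Value')])
>>> sh.validate_statusline('204 No Content')    # non-numeric code: replaced, returns False
False
>>> sh.statusline
'204 No Content'
>>> sh.replace_header('some', 'Other')          # case-insensitive match, returns the old value
'Value'
>>> sh.replace_header('New-Header', 'X')        # no match: appended, returns None
>>> sh.headers
[('Some', 'Other'), ('New-Header', 'X')]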
@@ -81,9 +110,16 @@ class StatusAndHeadersParser(object):
 
         statusline, total_read = _strip_count(full_statusline, 0)
 
+        headers = []
+
         # at end of stream
         if total_read == 0:
             raise EOFError()
+        elif not statusline:
+            return StatusAndHeaders(statusline=statusline,
+                                    headers=headers,
+                                    protocol='',
+                                    total_len=total_read)
 
         protocol_status = self.split_prefix(statusline, self.statuslist)
 
@@ -92,13 +128,15 @@ class StatusAndHeadersParser(object):
             msg = msg.format(self.statuslist, statusline)
             raise StatusAndHeadersParserException(msg, full_statusline)
 
-        headers = []
-
         line, total_read = _strip_count(stream.readline(), total_read)
         while line:
-            name, value = line.split(':', 1)
-            name = name.rstrip(' \t')
-            value = value.lstrip()
+            result = line.split(':', 1)
+            if len(result) == 2:
+                name = result[0].rstrip(' \t')
+                value = result[1].lstrip()
+            else:
+                name = result[0]
+                value = None
 
             next_line, total_read = _strip_count(stream.readline(),
                                                  total_read)
@@ -109,8 +147,10 @@ class StatusAndHeadersParser(object):
                 next_line, total_read = _strip_count(stream.readline(),
                                                      total_read)
 
-            header = (name, value)
-            headers.append(header)
+            if value is not None:
+                header = (name, value)
+                headers.append(header)
+
             line = next_line
 
         return StatusAndHeaders(statusline=protocol_status[1].strip(),
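With the parser changes above, a header line without a colon no longer raises ValueError from the tuple unpacking; it is simply skipped. A hedged sketch of that behavior (import paths taken from other hunks in this commit):

>>> from io import BytesIO
>>> from pywb.utils.statusandheaders import StatusAndHeadersParser
>>> buff = BytesIO('HTTP/1.0 200 OK\r\nContent-Type: ABC\r\nNoColonHere\r\n\r\n')
>>> parsed = StatusAndHeadersParser(['HTTP/1.0']).parse(buff)
>>> parsed.statusline
'200 OK'
>>> parsed.get_header('Content-Type')   # the malformed line is dropped
'ABC'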
@@ -13,6 +13,14 @@ StatusAndHeadersParserException: Expected Status Line starting with ['Other'] -
 >>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
 True
 
+# replace header, print new headers
+>>> st1.replace_header('some', 'Another-Value'); st1
+'Value'
+StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
+  ('Some', 'Another-Value'),
+  ('Multi-Line', 'Value1 Also This')])
+
+
 # remove header
 >>> st1.remove_header('some')
 True
@@ -20,6 +28,10 @@ True
 # already removed
 >>> st1.remove_header('Some')
 False
+
+# empty
+>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
+StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
 """
 
 
@@ -37,6 +49,11 @@ Multi-Line: Value1\r\n\
 Body"
 
 
+status_headers_2 = """
+
+"""
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
@@ -164,7 +164,7 @@ class ArchiveIndexer(object):
 
         digest = record.rec_headers.get_header('WARC-Payload-Digest')
 
-        status = record.status_headers.statusline.split(' ')[0]
+        status = self._extract_status(record.status_headers)
 
         if record.rec_type == 'revisit':
             mime = 'warc/revisit'
@@ -205,7 +205,9 @@ class ArchiveIndexer(object):
         timestamp = record.rec_headers.get_header('archive-date')
         if len(timestamp) > 14:
             timestamp = timestamp[:14]
-        status = record.status_headers.statusline.split(' ')[0]
+
+        status = self._extract_status(record.status_headers)
+
         mime = record.rec_headers.get_header('content-type')
         mime = self._extract_mime(mime)
 
@@ -228,6 +230,12 @@ class ArchiveIndexer(object):
             mime = 'unk'
         return mime
 
+    def _extract_status(self, status_headers):
+        status = status_headers.statusline.split(' ')[0]
+        if not status:
+            status = '-'
+        return status
+
     def read_rest(self, reader, digester=None):
         """ Read remainder of the stream
         If a digester is included, update it
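A quick illustration of the edge case _extract_status covers, using plain string splitting rather than the indexer API: an empty statusline would otherwise leave a blank CDX status column:

>>> '200 OK'.split(' ')[0]
'200'
>>> ''.split(' ')[0]    # empty statusline from an empty or invalid record
''
>>> '-'.split(' ')[0]   # the loader's '-' placeholder passes through unchanged
'-'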
@@ -97,18 +97,24 @@ class ArcWarcRecordLoader:
         rec_type = rec_headers.get_header('WARC-Type')
         length = rec_headers.get_header('Content-Length')
 
+        is_err = False
+
         try:
             length = int(length)
             if length < 0:
-                length = 0
+                is_err = True
         except ValueError:
-            length = 0
+            is_err = True
 
         # ================================================================
         # handle different types of records
 
+        # err condition
+        if is_err:
+            status_headers = StatusAndHeaders('-', [])
+            length = 0
         # special case: empty w/arc record (hopefully a revisit)
-        if length == 0:
+        elif length == 0:
             status_headers = StatusAndHeaders('204 No Content', [])
 
         # special case: warc records that are not expected to have http headers
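A condensed, hypothetical restatement of the branching above, to make the three outcomes explicit. This is a sketch, not the loader's real API; the actual code catches only ValueError, and the TypeError here just guards a missing header in the sketch:

    from pywb.utils.statusandheaders import StatusAndHeaders

    def empty_record_status(raw_length):
        try:
            length = int(raw_length)
            is_err = (length < 0)
        except (ValueError, TypeError):
            is_err = True
            length = 0
        if is_err:
            # later indexed with a '-' status column
            return StatusAndHeaders('-', []), 0
        elif length == 0:
            # empty record, served as 204 No Content
            return StatusAndHeaders('204 No Content', []), 0
        # non-empty record: fall through to normal http header parsing
        return None, length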
@@ -63,6 +63,9 @@ class ResolvingLoader:
         if not headers_record or not payload_record:
             raise ArchiveLoadFailed('Could not load ' + str(cdx))
 
+        # ensure status line is valid from here
+        headers_record.status_headers.validate_statusline('204 No Content')
+
         return (headers_record.status_headers, payload_record.stream)
 
     def _resolve_path_load(self, cdx, is_original, failed_files):
@@ -36,8 +36,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
 # bad arcs -- test error edge cases
 >>> print_cdx_index('bad.arc')
  CDX N b a m s k r M S V g
-com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
-com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc
+com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
+com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
+com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
 
 # Test CLI interface -- (check for num lines)
 #=================================================================
@@ -46,7 +47,7 @@ com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX
 >>> cli_lines(['--sort', '-', TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
 org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
-200
+201
 
 # test writing to stdout
 >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@@ -1,9 +1,9 @@
 import re
 from io import BytesIO
 
-from pywb.utils.bufferedreaders import ChunkedDataReader
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.wbexception import WbException, NotFoundException
+from pywb.utils.loaders import LimitReader
 
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import MementoResponse
@@ -105,6 +105,9 @@ class ReplayView(object):
             if redir_response:
                 return redir_response
 
+        length = status_headers.get_header('content-length')
+        stream = LimitReader.wrap_stream(stream, length)
+
         # one more check for referrer-based self-redirect
         self._reject_referrer_self_redirect(wbrequest)
 
@@ -124,9 +127,6 @@ class ReplayView(object):
 
         # buffer response if buffering enabled
         if self.buffer_response:
-            if wbrequest.is_identity:
-                status_headers.remove_header('content-length')
-
             response_iter = self.buffered_response(status_headers,
                                                    response_iter)
 
@@ -165,8 +165,10 @@ class ReplayView(object):
         content = out.getvalue()
 
         content_length_str = str(len(content))
-        status_headers.headers.append(('Content-Length',
-                                       content_length_str))
+
+        # remove existing content length
+        status_headers.replace_header('Content-Length',
+                                      content_length_str)
         out.close()
 
         return content
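A minimal sketch of why replace_header is used here instead of appending: after buffering, the body length can differ from the recorded Content-Length, and appending would leave two conflicting headers. Assumes the StatusAndHeaders class from the earlier hunk:

>>> sh = StatusAndHeaders('200 OK', [('Content-Length', '999')])
>>> sh.headers.append(('Content-Length', '10'))   # old behavior: duplicate, conflicting values
>>> sh.headers
[('Content-Length', '999'), ('Content-Length', '10')]
>>> sh = StatusAndHeaders('200 OK', [('Content-Length', '999')])
>>> sh.replace_header('Content-Length', '10')     # new behavior: single corrected value, old one returned
'999'
>>> sh.headers
[('Content-Length', '10')]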