1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

record reading: better handling of empty arc (or warc) records

for indexing, index empty/invalid length as '-' status code
for reading, serve as 204 no content.
ensure that StatusAndHeaders has a valid statusline when serving
if http content-length is valid, limit stream to that content-length
as well as record content-length (whichever is smaller)
replace content-length when buffering
This commit is contained in:
Ilya Kreymer 2014-04-07 17:08:39 -07:00
parent d8c20a59cf
commit 64eef7063d
8 changed files with 108 additions and 23 deletions

View File

@ -93,7 +93,10 @@ class BlockLoader(object):
headers['Range'] = range_header
if self.cookie_maker:
headers['Cookie'] = self.cookie_maker.make()
if isinstance(self.cookie_maker, basestring):
headers['Cookie'] = self.cookie_maker
else:
headers['Cookie'] = self.cookie_maker.make()
request = urllib2.Request(url, headers=headers)
return urllib2.urlopen(request)
@ -184,7 +187,12 @@ class LimitReader(object):
try:
content_length = int(content_length)
if content_length >= 0:
stream = LimitReader(stream, content_length)
# optimize: if already a LimitReader, set limit to
# the smaller of the two limits
if isinstance(stream, LimitReader):
stream.limit = min(stream.limit, content_length)
else:
stream = LimitReader(stream, content_length)
except (ValueError, TypeError):
pass

View File

@ -29,6 +29,21 @@ class StatusAndHeaders(object):
if value[0].lower() == name_lower:
return value[1]
def replace_header(self, name, value):
    """
    Replace the value of an existing header, or add it as a new header.

    The name is matched case-insensitively. The header list is scanned
    from the end, so if the same name occurs more than once only the last
    occurrence is replaced. The replaced entry keeps the name spelling
    that was already stored.

    Returns the old header value, or None if no header matched and
    (name, value) was appended instead.
    """
    name_lower = name.lower()

    # range() rather than xrange(): identical result for this short list,
    # and it does not raise NameError on Python 3
    for index in range(len(self.headers) - 1, -1, -1):
        curr_name, curr_value = self.headers[index]
        if curr_name.lower() == name_lower:
            self.headers[index] = (curr_name, value)
            return curr_value

    self.headers.append((name, value))
    return None
def remove_header(self, name):
"""
remove header (case-insensitive)
@ -42,6 +57,20 @@ class StatusAndHeaders(object):
return False
def validate_statusline(self, valid_statusline):
    """
    Check that the statusline is valid, eg. starts with a positive
    numeric code. If not, replace it with the passed in valid_statusline.

    Returns True if the existing statusline was kept, False if it was
    replaced with valid_statusline.
    """
    code = self.statusline.split(' ', 1)[0]

    # Explicit comparison instead of assert: the previous
    # 'except ValueError, AssertionError' form only caught ValueError
    # (binding it to the name AssertionError), so a non-positive code
    # escaped as an uncaught AssertionError, and asserts are removed
    # entirely when running with -O.
    try:
        is_valid = int(code) > 0
    except ValueError:
        is_valid = False

    if not is_valid:
        self.statusline = valid_statusline

    return is_valid
def __repr__(self):
headers_str = pprint.pformat(self.headers, indent=2)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
@ -81,9 +110,16 @@ class StatusAndHeadersParser(object):
statusline, total_read = _strip_count(full_statusline, 0)
headers = []
# at end of stream
if total_read == 0:
raise EOFError()
elif not statusline:
return StatusAndHeaders(statusline=statusline,
headers=headers,
protocol='',
total_len=total_read)
protocol_status = self.split_prefix(statusline, self.statuslist)
@ -92,13 +128,15 @@ class StatusAndHeadersParser(object):
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
headers = []
line, total_read = _strip_count(stream.readline(), total_read)
while line:
name, value = line.split(':', 1)
name = name.rstrip(' \t')
value = value.lstrip()
result = line.split(':', 1)
if len(result) == 2:
name = result[0].rstrip(' \t')
value = result[1].lstrip()
else:
name = result[0]
value = None
next_line, total_read = _strip_count(stream.readline(),
total_read)
@ -109,8 +147,10 @@ class StatusAndHeadersParser(object):
next_line, total_read = _strip_count(stream.readline(),
total_read)
header = (name, value)
headers.append(header)
if value is not None:
header = (name, value)
headers.append(header)
line = next_line
return StatusAndHeaders(statusline=protocol_status[1].strip(),

View File

@ -13,6 +13,14 @@ StatusAndHeadersParserException: Expected Status Line starting with ['Other'] -
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
True
# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Another-Value'),
('Multi-Line', 'Value1 Also This')])
# remove header
>>> st1.remove_header('some')
True
@ -20,6 +28,10 @@ True
# already removed
>>> st1.remove_header('Some')
False
# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
"""
@ -37,6 +49,11 @@ Multi-Line: Value1\r\n\
Body"
status_headers_2 = """
"""
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -164,7 +164,7 @@ class ArchiveIndexer(object):
digest = record.rec_headers.get_header('WARC-Payload-Digest')
status = record.status_headers.statusline.split(' ')[0]
status = self._extract_status(record.status_headers)
if record.rec_type == 'revisit':
mime = 'warc/revisit'
@ -205,7 +205,9 @@ class ArchiveIndexer(object):
timestamp = record.rec_headers.get_header('archive-date')
if len(timestamp) > 14:
timestamp = timestamp[:14]
status = record.status_headers.statusline.split(' ')[0]
status = self._extract_status(record.status_headers)
mime = record.rec_headers.get_header('content-type')
mime = self._extract_mime(mime)
@ -228,6 +230,12 @@ class ArchiveIndexer(object):
mime = 'unk'
return mime
def _extract_status(self, status_headers):
status = status_headers.statusline.split(' ')[0]
if not status:
status = '-'
return status
def read_rest(self, reader, digester=None):
""" Read remainder of the stream
If a digester is included, update it

View File

@ -97,18 +97,24 @@ class ArcWarcRecordLoader:
rec_type = rec_headers.get_header('WARC-Type')
length = rec_headers.get_header('Content-Length')
is_err = False
try:
length = int(length)
if length < 0:
length = 0
is_err = True
except ValueError:
length = 0
is_err = True
# ================================================================
# handle different types of records
# err condition
if is_err:
status_headers = StatusAndHeaders('-', [])
length = 0
# special case: empty w/arc record (hopefully a revisit)
if length == 0:
elif length == 0:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers

View File

@ -63,6 +63,9 @@ class ResolvingLoader:
if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here
headers_record.status_headers.validate_statusline('204 No Content')
return (headers_record.status_headers, payload_record.stream)
def _resolve_path_load(self, cdx, is_original, failed_files):

View File

@ -36,8 +36,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc')
CDX N b a m s k r M S V g
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc
com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc
com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc
# Test CLI interface -- (check for num lines)
#=================================================================
@ -46,7 +47,7 @@ com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz
200
201
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])

View File

@ -1,9 +1,9 @@
import re
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
@ -105,6 +105,9 @@ class ReplayView(object):
if redir_response:
return redir_response
length = status_headers.get_header('content-length')
stream = LimitReader.wrap_stream(stream, length)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest)
@ -124,9 +127,6 @@ class ReplayView(object):
# buffer response if buffering enabled
if self.buffer_response:
if wbrequest.is_identity:
status_headers.remove_header('content-length')
response_iter = self.buffered_response(status_headers,
response_iter)
@ -165,8 +165,10 @@ class ReplayView(object):
content = out.getvalue()
content_length_str = str(len(content))
status_headers.headers.append(('Content-Length',
content_length_str))
# replace existing content length (or add it if missing)
status_headers.replace_header('Content-Length',
content_length_str)
out.close()
return content