diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index ccbe960e..f86e4072 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -93,7 +93,10 @@ class BlockLoader(object): headers['Range'] = range_header if self.cookie_maker: - headers['Cookie'] = self.cookie_maker.make() + if isinstance(self.cookie_maker, basestring): + headers['Cookie'] = self.cookie_maker + else: + headers['Cookie'] = self.cookie_maker.make() request = urllib2.Request(url, headers=headers) return urllib2.urlopen(request) @@ -184,7 +187,12 @@ class LimitReader(object): try: content_length = int(content_length) if content_length >= 0: - stream = LimitReader(stream, content_length) + # optimize: if already a LimitStream, set limit to + # the smaller of the two limits + if isinstance(stream, LimitReader): + stream.limit = min(stream.limit, content_length) + else: + stream = LimitReader(stream, content_length) except (ValueError, TypeError): pass diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 24dcf784..85805cb2 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -29,6 +29,21 @@ class StatusAndHeaders(object): if value[0].lower() == name_lower: return value[1] + def replace_header(self, name, value): + """ + replace header with new value or add new header + return old header value, if any + """ + name_lower = name.lower() + for index in xrange(len(self.headers) - 1, -1, -1): + curr_name, curr_value = self.headers[index] + if curr_name.lower() == name_lower: + self.headers[index] = (curr_name, value) + return curr_value + + self.headers.append((name, value)) + return None + def remove_header(self, name): """ remove header (case-insensitive) @@ -42,6 +57,20 @@ class StatusAndHeaders(object): return False + def validate_statusline(self, valid_statusline): + """ + Check that the statusline is valid, eg. starts with a numeric + code. If not, replace with passed in valid_statusline + """ + code = self.statusline.split(' ', 1)[0] + try: + code = int(code) + assert(code > 0) + return True + except ValueError, AssertionError: + self.statusline = valid_statusline + return False + def __repr__(self): headers_str = pprint.pformat(self.headers, indent=2) return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ @@ -81,9 +110,16 @@ class StatusAndHeadersParser(object): statusline, total_read = _strip_count(full_statusline, 0) + headers = [] + # at end of stream if total_read == 0: raise EOFError() + elif not statusline: + return StatusAndHeaders(statusline=statusline, + headers=headers, + protocol='', + total_len=total_read) protocol_status = self.split_prefix(statusline, self.statuslist) @@ -92,13 +128,15 @@ class StatusAndHeadersParser(object): msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, full_statusline) - headers = [] - line, total_read = _strip_count(stream.readline(), total_read) while line: - name, value = line.split(':', 1) - name = name.rstrip(' \t') - value = value.lstrip() + result = line.split(':', 1) + if len(result) == 2: + name = result[0].rstrip(' \t') + value = result[1].lstrip() + else: + name = result[0] + value = None next_line, total_read = _strip_count(stream.readline(), total_read) @@ -109,8 +147,10 @@ class StatusAndHeadersParser(object): next_line, total_read = _strip_count(stream.readline(), total_read) - header = (name, value) - headers.append(header) + if value is not None: + header = (name, value) + headers.append(header) + line = next_line return StatusAndHeaders(statusline=protocol_status[1].strip(), diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py index ea835e32..061532a3 100644 --- a/pywb/utils/test/test_statusandheaders.py +++ b/pywb/utils/test/test_statusandheaders.py @@ -13,6 +13,14 @@ StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - >>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) True +# replace header, print new headers +>>> st1.replace_header('some', 'Another-Value'); st1 +'Value' +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), + ('Some', 'Another-Value'), + ('Multi-Line', 'Value1 Also This')]) + + # remove header >>> st1.remove_header('some') True @@ -20,6 +28,10 @@ True # already removed >>> st1.remove_header('Some') False + +# empty +>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2 +StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = []) """ @@ -37,6 +49,11 @@ Multi-Line: Value1\r\n\ Body" +status_headers_2 = """ + +""" + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index 6ee3a10c..2247ced4 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -164,7 +164,7 @@ class ArchiveIndexer(object): digest = record.rec_headers.get_header('WARC-Payload-Digest') - status = record.status_headers.statusline.split(' ')[0] + status = self._extract_status(record.status_headers) if record.rec_type == 'revisit': mime = 'warc/revisit' @@ -205,7 +205,9 @@ class ArchiveIndexer(object): timestamp = record.rec_headers.get_header('archive-date') if len(timestamp) > 14: timestamp = timestamp[:14] - status = record.status_headers.statusline.split(' ')[0] + + status = self._extract_status(record.status_headers) + mime = record.rec_headers.get_header('content-type') mime = self._extract_mime(mime) @@ -228,6 +230,12 @@ class ArchiveIndexer(object): mime = 'unk' return mime + def _extract_status(self, status_headers): + status = status_headers.statusline.split(' ')[0] + if not status: + status = '-' + return status + def read_rest(self, reader, digester=None): """ Read remainder of the stream If a digester is included, update it diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 96e149e3..4c71dee3 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -97,18 +97,24 @@ class ArcWarcRecordLoader: rec_type = rec_headers.get_header('WARC-Type') length = rec_headers.get_header('Content-Length') + is_err = False + try: length = int(length) if length < 0: - length = 0 + is_err = True except ValueError: - length = 0 + is_err = True # ================================================================ # handle different types of records + # err condition + if is_err: + status_headers = StatusAndHeaders('-', []) + length = 0 # special case: empty w/arc record (hopefully a revisit) - if length == 0: + elif length == 0: status_headers = StatusAndHeaders('204 No Content', []) # special case: warc records that are not expected to have http headers diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 10c7caa0..393efc3e 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -63,6 +63,9 @@ class ResolvingLoader: if not headers_record or not payload_record: raise ArchiveLoadFailed('Could not load ' + str(cdx)) + # ensure status line is valid from here + headers_record.status_headers.validate_statusline('204 No Content') + return (headers_record.status_headers, payload_record.stream) def _resolve_path_load(self, cdx, is_original, failed_files): diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index 0e470424..0a3d6038 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -36,8 +36,9 @@ metadata)/gnu.org/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/ # bad arcs -- test error edge cases >>> print_cdx_index('bad.arc') CDX N b a m s k r M S V g -com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc -com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 202 bad.arc +com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc +com,example)/ 20140102000000 http://example.com/ text/plain - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc +com,example)/ 20140401000000 http://example.com/ text/html - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc # Test CLI interface -- (check for num lines) #================================================================= @@ -46,7 +47,7 @@ com,example)/ 20140401000000 http://example.com/ text/html 204 3I42H3S6NNFQ2MSVX >>> cli_lines(['--sort', '-', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz -200 +201 # test writing to stdout >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 2ab17225..31fe4b57 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -1,9 +1,9 @@ import re from io import BytesIO -from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException +from pywb.utils.loaders import LimitReader from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse @@ -105,6 +105,9 @@ class ReplayView(object): if redir_response: return redir_response + length = status_headers.get_header('content-length') + stream = LimitReader.wrap_stream(stream, length) + # one more check for referrer-based self-redirect self._reject_referrer_self_redirect(wbrequest) @@ -124,9 +127,6 @@ class ReplayView(object): # buffer response if buffering enabled if self.buffer_response: - if wbrequest.is_identity: - status_headers.remove_header('content-length') - response_iter = self.buffered_response(status_headers, response_iter) @@ -165,8 +165,10 @@ class ReplayView(object): content = out.getvalue() content_length_str = str(len(content)) - status_headers.headers.append(('Content-Length', - content_length_str)) + + # remove existing content length + status_headers.replace_header('Content-Length', + content_length_str) out.close() return content