diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 93725628..92e897fc 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -65,23 +65,36 @@ class StatusAndHeadersParser(object): """ parse stream for status line and headers return a StatusAndHeaders object + + support continuation headers starting with space or tab """ statusline = stream.readline().rstrip() protocol_status = self.split_prefix(statusline, self.statuslist) if not protocol_status: - msg = 'Expected Status Line - Found: ' + statusline + msg = 'Expected Status Line starting with {0} - Found: {1}' + msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, statusline) headers = [] line = stream.readline().rstrip() - while line and line != '\r\n': + while line: name, value = line.split(':', 1) - header = (name, value.strip()) + name = name.rstrip(' \t') + value = value.lstrip() + + next_line = stream.readline().rstrip() + + # append continuation lines, if any + while next_line and next_line.startswith((' ', '\t')): + value += next_line + next_line = stream.readline().rstrip() + + header = (name, value) headers.append(header) - line = stream.readline().rstrip() + line = next_line return StatusAndHeaders(statusline=protocol_status[1].strip(), headers=headers, diff --git a/pywb/utils/test/statusandheaders_test.py b/pywb/utils/test/statusandheaders_test.py new file mode 100644 index 00000000..3473e71e --- /dev/null +++ b/pywb/utils/test/statusandheaders_test.py @@ -0,0 +1,29 @@ +""" +>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1)) +StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), + ('Some', 'Value'), + ('Multi-Line', 'Value1 Also This')]) + +>>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1)) +Traceback (most recent call last): +StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK +""" + + +from pywb.utils.statusandheaders import StatusAndHeadersParser +import StringIO + + +status_headers_1 = "\ +HTTP/1.0 200 OK\r\n\ +Content-Type: ABC\r\n\ +Some: Value\r\n\ +Multi-Line: Value1\r\n\ + Also This\r\n\ +\r\n\ +Body" + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 47176e3e..02ab54cb 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -213,3 +213,6 @@ def load_from_cdx_test(cdx): except Exception as e: print 'Exception: ' + e.__class__.__name__ +if __name__ == "__main__": + import doctest + doctest.testmod()