diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 2718d428..2c9d0278 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -4,7 +4,6 @@ from io import BytesIO from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException -from pywb.utils.loaders import LimitReader from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse @@ -110,12 +109,6 @@ class ReplayView: response = None - # if Content-Length for payload is present, - # ensure we don't read past it - content_length = status_headers.get_header('content-length') - if content_length: - stream = LimitReader.wrap_stream(stream, content_length) - if self.content_rewriter and wbrequest.wb_url.mod != 'id_': response = self.rewrite_content(wbrequest, diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 92e897fc..64102b1c 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -13,10 +13,11 @@ class StatusAndHeaders(object): Headers is a list of (name, value) tuples An optional protocol which appears on first line may be specified """ - def __init__(self, statusline, headers, protocol=''): + def __init__(self, statusline, headers, protocol='', total_len=0): self.statusline = statusline self.headers = headers self.protocol = protocol + self.total_len = total_len def get_header(self, name): """ @@ -52,6 +53,12 @@ headers = {2})".format(self.protocol, self.statusline, headers_str) self.protocol == other.protocol) +#================================================================= +def _strip_count(string, total_read): + length = len(string) + return string.rstrip(), total_read + length + + #================================================================= class StatusAndHeadersParser(object): """ @@ -68,29 +75,33 @@ class StatusAndHeadersParser(object): support continuation headers starting with space or tab """ - statusline = stream.readline().rstrip() + # status line w newlines intact + full_statusline = stream.readline() + statusline, total_read = _strip_count(full_statusline, 0) protocol_status = self.split_prefix(statusline, self.statuslist) if not protocol_status: msg = 'Expected Status Line starting with {0} - Found: {1}' msg = msg.format(self.statuslist, statusline) - raise StatusAndHeadersParserException(msg, statusline) + raise StatusAndHeadersParserException(msg, full_statusline) headers = [] - line = stream.readline().rstrip() + line, total_read = _strip_count(stream.readline(), total_read) while line: name, value = line.split(':', 1) name = name.rstrip(' \t') value = value.lstrip() - next_line = stream.readline().rstrip() + next_line, total_read = _strip_count(stream.readline(), + total_read) # append continuation lines, if any while next_line and next_line.startswith((' ', '\t')): value += next_line - next_line = stream.readline().rstrip() + next_line, total_read = _strip_count(stream.readline(), + total_read) header = (name, value) headers.append(header) @@ -98,7 +109,8 @@ class StatusAndHeadersParser(object): return StatusAndHeaders(statusline=protocol_status[1].strip(), headers=headers, - protocol=protocol_status[0]) + protocol=protocol_status[0], + total_len=total_read) @staticmethod def split_prefix(key, prefixs): diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 4acb491f..5fe5f64d 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import BlockLoader +from pywb.utils.loaders import BlockLoader, LimitReader from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException @@ -73,14 +73,14 @@ class ArcWarcRecordLoader: if the_format == 'arc': rec_type = 'response' - empty = (rec_headers.get_header('length') == 0) + length = int(rec_headers.get_header('length')) elif the_format == 'warc': rec_type = rec_headers.get_header('WARC-Type') - empty = (rec_headers.get_header('Content-Length') == '0') + length = int(rec_headers.get_header('Content-Length')) # special case: empty w/arc record (hopefully a revisit) - if empty: + if length == 0: status_headers = StatusAndHeaders('204 No Content', []) # special case: warc records that are not expected to have http headers @@ -102,6 +102,13 @@ class ArcWarcRecordLoader: #(statusline, http_headers) = self.parse_http_headers(stream) status_headers = self.http_parser.parse(stream) + # limit the stream to the remainder, if >0 + # should always be valid, but just in case, still stream if + # content-length was not set + remains = length - status_headers.total_len + if remains > 0: + stream = LimitReader.wrap_stream(stream, remains) + return ArcWarcRecord((the_format, rec_type), rec_headers, stream, status_headers) @@ -137,9 +144,14 @@ class ARCHeadersParser: def parse(self, stream, headerline=None): + total_read = 0 + # if headerline passed in, use that if not headerline: - headerline = stream.readline().rstrip() + headerline = stream.readline() + + total_read = len(headerline) + headerline = headerline.rstrip() parts = headerline.split() @@ -157,4 +169,5 @@ class ARCHeadersParser: return StatusAndHeaders(statusline='', headers=headers, - protocol='ARC/1.0') + protocol='ARC/1.0', + total_len=total_read) diff --git a/sample_archive/cdx/example-arc-test.cdx b/sample_archive/cdx/example-arc-test.cdx new file mode 100644 index 00000000..21f2c291 --- /dev/null +++ b/sample_archive/cdx/example-arc-test.cdx @@ -0,0 +1,3 @@ + CDX N b a m s k r M S V g +com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc +com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz diff --git a/tests/test_integration.py b/tests/test_integration.py index 526ca69d..c5a4f43a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -93,9 +93,24 @@ class TestWb: def test_replay_identity_1(self): resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') - #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg') - #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css') - #self._assert_basic_html(resp) + + # no wb header insertion + assert 'wb.js' not in resp.body + + # original unrewritten url present + assert '"http://www.iana.org/domains/example"' in resp.body + + def test_replay_identity_2_arcgz(self): + resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com') + + # no wb header insertion + assert 'wb.js' not in resp.body + + # original unrewritten url present + assert '"http://www.iana.org/domains/example"' in resp.body + + def test_replay_identity_2_arc(self): + resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com') # no wb header insertion assert 'wb.js' not in resp.body