limit stream by warc/arc record length instead of

http content length. Also track the length of StatusAndHeaders. Add tests to verify that the content length is correct for identity arc and arcgz replays as well.
2025-03-15 00:03:28 +01:00 · 2014-03-22 11:30:51 -07:00 · 2014-03-22 11:30:51 -07:00 · 79da12348f
commit 79da12348f
parent 53590537e0
5 changed files with 59 additions and 23 deletions
--- a/pywb/core/replay_views.py
+++ b/pywb/core/replay_views.py
@ -4,7 +4,6 @@ from io import BytesIO
 from pywb.utils.bufferedreaders import ChunkedDataReader
 from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.wbexception import WbException
-from pywb.utils.loaders import LimitReader

 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import MementoResponse
@ -110,12 +109,6 @@ class ReplayView:

        response = None

-        # if Content-Length for payload is present,
-        # ensure we don't read past it
-        content_length = status_headers.get_header('content-length')
-        if content_length:
-            stream = LimitReader.wrap_stream(stream, content_length)
-
        if self.content_rewriter and wbrequest.wb_url.mod != 'id_':

            response = self.rewrite_content(wbrequest,
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@ -13,10 +13,11 @@ class StatusAndHeaders(object):
    Headers is a list of (name, value) tuples
    An optional protocol which appears on first line may be specified
    """
-    def __init__(self, statusline, headers, protocol=''):
+    def __init__(self, statusline, headers, protocol='', total_len=0):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol
+        self.total_len = total_len

    def get_header(self, name):
        """
@ -52,6 +53,12 @@ headers = {2})".format(self.protocol, self.statusline, headers_str)
                self.protocol == other.protocol)


+#=================================================================
+def _strip_count(string, total_read):
+    length = len(string)
+    return string.rstrip(), total_read + length
+
+
 #=================================================================
 class StatusAndHeadersParser(object):
    """
@ -68,29 +75,33 @@ class StatusAndHeadersParser(object):

        support continuation headers starting with space or tab
        """
-        statusline = stream.readline().rstrip()
+        # status line w newlines intact
+        full_statusline = stream.readline()
+        statusline, total_read = _strip_count(full_statusline, 0)

        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
            msg = 'Expected Status Line starting with {0} - Found: {1}'
            msg = msg.format(self.statuslist, statusline)
-            raise StatusAndHeadersParserException(msg, statusline)
+            raise StatusAndHeadersParserException(msg, full_statusline)

        headers = []

-        line = stream.readline().rstrip()
+        line, total_read = _strip_count(stream.readline(), total_read)
        while line:
            name, value = line.split(':', 1)
            name = name.rstrip(' \t')
            value = value.lstrip()

-            next_line = stream.readline().rstrip()
+            next_line, total_read = _strip_count(stream.readline(),
+                                                 total_read)

            # append continuation lines, if any
            while next_line and next_line.startswith((' ', '\t')):
                value += next_line
-                next_line = stream.readline().rstrip()
+                next_line, total_read = _strip_count(stream.readline(),
+                                                     total_read)

            header = (name, value)
            headers.append(header)
@ -98,7 +109,8 @@ class StatusAndHeadersParser(object):

        return StatusAndHeaders(statusline=protocol_status[1].strip(),
                                headers=headers,
-                                protocol=protocol_status[0])
+                                protocol=protocol_status[0],
+                                total_len=total_read)

    @staticmethod
    def split_prefix(key, prefixs):
--- a/pywb/warc/recordloader.py
+++ b/pywb/warc/recordloader.py
@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.statusandheaders import StatusAndHeadersParser
 from pywb.utils.statusandheaders import StatusAndHeadersParserException

-from pywb.utils.loaders import BlockLoader
+from pywb.utils.loaders import BlockLoader, LimitReader
 from pywb.utils.bufferedreaders import DecompressingBufferedReader

 from pywb.utils.wbexception import WbException
@ -73,14 +73,14 @@ class ArcWarcRecordLoader:

        if the_format == 'arc':
            rec_type = 'response'
-            empty = (rec_headers.get_header('length') == 0)
+            length = int(rec_headers.get_header('length'))

        elif the_format == 'warc':
            rec_type = rec_headers.get_header('WARC-Type')
-            empty = (rec_headers.get_header('Content-Length') == '0')
+            length = int(rec_headers.get_header('Content-Length'))

        # special case: empty w/arc record (hopefully a revisit)
-        if empty:
+        if length == 0:
            status_headers = StatusAndHeaders('204 No Content', [])

        # special case: warc records that are not expected to have http headers
@ -102,6 +102,13 @@ class ArcWarcRecordLoader:
            #(statusline, http_headers) = self.parse_http_headers(stream)
            status_headers = self.http_parser.parse(stream)

+        # limit the stream to the remainder, if >0
+        # should always be valid, but just in case, still stream if
+        # content-length was not set
+        remains = length - status_headers.total_len
+        if remains > 0:
+            stream = LimitReader.wrap_stream(stream, remains)
+
        return ArcWarcRecord((the_format, rec_type),
                             rec_headers, stream, status_headers)

@ -137,9 +144,14 @@ class ARCHeadersParser:

    def parse(self, stream, headerline=None):

+        total_read = 0
+
        # if headerline passed in, use that
        if not headerline:
-            headerline = stream.readline().rstrip()
+            headerline = stream.readline()
+
+        total_read = len(headerline)
+        headerline = headerline.rstrip()

        parts = headerline.split()

@ -157,4 +169,5 @@ class ARCHeadersParser:

        return StatusAndHeaders(statusline='',
                                headers=headers,
-                                protocol='ARC/1.0')
+                                protocol='ARC/1.0',
+                                total_len=total_read)
--- a/sample_archive/cdx/example-arc-test.cdx
+++ b/sample_archive/cdx/example-arc-test.cdx
@ -0,0 +1,3 @@
+ CDX N b a m s k r M S V g
+com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
+com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -93,9 +93,24 @@ class TestWb:

    def test_replay_identity_1(self):
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
-        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
-        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
-        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body
+
+    def test_replay_identity_2_arcgz(self):
+        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com')
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body
+
+    def test_replay_identity_2_arc(self):
+        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com')

        # no wb header insertion
        assert 'wb.js' not in resp.body