1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Limit the stream by WARC/ARC record length instead of HTTP Content-Length.

Track the length of StatusAndHeaders as well.
Add tests to verify that the content length is correct for identity
ARC and ARC.gz replays as well.
This commit is contained in:
Ilya Kreymer 2014-03-22 11:30:51 -07:00
parent 53590537e0
commit 79da12348f
5 changed files with 59 additions and 23 deletions

View File

@ -4,7 +4,6 @@ from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException
from pywb.utils.loaders import LimitReader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
@ -110,12 +109,6 @@ class ReplayView:
response = None
# if Content-Length for payload is present,
# ensure we don't read past it
content_length = status_headers.get_header('content-length')
if content_length:
stream = LimitReader.wrap_stream(stream, content_length)
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest,

View File

@ -13,10 +13,11 @@ class StatusAndHeaders(object):
Headers is a list of (name, value) tuples
An optional protocol which appears on first line may be specified
"""
def __init__(self, statusline, headers, protocol=''):
def __init__(self, statusline, headers, protocol='', total_len=0):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
self.total_len = total_len
def get_header(self, name):
"""
@ -52,6 +53,12 @@ headers = {2})".format(self.protocol, self.statusline, headers_str)
self.protocol == other.protocol)
#=================================================================
def _strip_count(string, total_read):
length = len(string)
return string.rstrip(), total_read + length
#=================================================================
class StatusAndHeadersParser(object):
"""
@ -68,29 +75,33 @@ class StatusAndHeadersParser(object):
support continuation headers starting with space or tab
"""
statusline = stream.readline().rstrip()
# status line w newlines intact
full_statusline = stream.readline()
statusline, total_read = _strip_count(full_statusline, 0)
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
headers = []
line = stream.readline().rstrip()
line, total_read = _strip_count(stream.readline(), total_read)
while line:
name, value = line.split(':', 1)
name = name.rstrip(' \t')
value = value.lstrip()
next_line = stream.readline().rstrip()
next_line, total_read = _strip_count(stream.readline(),
total_read)
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
value += next_line
next_line = stream.readline().rstrip()
next_line, total_read = _strip_count(stream.readline(),
total_read)
header = (name, value)
headers.append(header)
@ -98,7 +109,8 @@ class StatusAndHeadersParser(object):
return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers,
protocol=protocol_status[0])
protocol=protocol_status[0],
total_len=total_read)
@staticmethod
def split_prefix(key, prefixs):

View File

@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import BlockLoader, LimitReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
@ -73,14 +73,14 @@ class ArcWarcRecordLoader:
if the_format == 'arc':
rec_type = 'response'
empty = (rec_headers.get_header('length') == 0)
length = int(rec_headers.get_header('length'))
elif the_format == 'warc':
rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0')
length = int(rec_headers.get_header('Content-Length'))
# special case: empty w/arc record (hopefully a revisit)
if empty:
if length == 0:
status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers
@ -102,6 +102,13 @@ class ArcWarcRecordLoader:
#(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream)
# limit the stream to the remainder, if >0
# should always be valid, but just in case, still stream if
# content-length was not set
remains = length - status_headers.total_len
if remains > 0:
stream = LimitReader.wrap_stream(stream, remains)
return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers)
@ -137,9 +144,14 @@ class ARCHeadersParser:
def parse(self, stream, headerline=None):
total_read = 0
# if headerline passed in, use that
if not headerline:
headerline = stream.readline().rstrip()
headerline = stream.readline()
total_read = len(headerline)
headerline = headerline.rstrip()
parts = headerline.split()
@ -157,4 +169,5 @@ class ARCHeadersParser:
return StatusAndHeaders(statusline='',
headers=headers,
protocol='ARC/1.0')
protocol='ARC/1.0',
total_len=total_read)

View File

@ -0,0 +1,3 @@
CDX N b a m s k r M S V g
com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz

View File

@ -93,9 +93,24 @@ class TestWb:
def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
#resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
#self._assert_basic_html(resp)
# no wb header insertion
assert 'wb.js' not in resp.body
# original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_identity_2_arcgz(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com')
# no wb header insertion
assert 'wb.js' not in resp.body
# original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_identity_2_arc(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com')
# no wb header insertion
assert 'wb.js' not in resp.body