1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Limit the stream by WARC/ARC record length instead of HTTP Content-Length.

Track the length of StatusAndHeaders (the bytes consumed while parsing) as well.
Add tests to verify that the content length is correct for identity (id_)
replay of ARC and ARC.gz records as well.
This commit is contained in:
Ilya Kreymer 2014-03-22 11:30:51 -07:00
parent 53590537e0
commit 79da12348f
5 changed files with 59 additions and 23 deletions

View File

@ -4,7 +4,6 @@ from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.loaders import LimitReader
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
@ -110,12 +109,6 @@ class ReplayView:
response = None response = None
# if Content-Length for payload is present,
# ensure we don't read past it
content_length = status_headers.get_header('content-length')
if content_length:
stream = LimitReader.wrap_stream(stream, content_length)
if self.content_rewriter and wbrequest.wb_url.mod != 'id_': if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, response = self.rewrite_content(wbrequest,

View File

@ -13,10 +13,11 @@ class StatusAndHeaders(object):
Headers is a list of (name, value) tuples Headers is a list of (name, value) tuples
An optional protocol which appears on first line may be specified An optional protocol which appears on first line may be specified
""" """
def __init__(self, statusline, headers, protocol=''): def __init__(self, statusline, headers, protocol='', total_len=0):
self.statusline = statusline self.statusline = statusline
self.headers = headers self.headers = headers
self.protocol = protocol self.protocol = protocol
self.total_len = total_len
def get_header(self, name): def get_header(self, name):
""" """
@ -52,6 +53,12 @@ headers = {2})".format(self.protocol, self.statusline, headers_str)
self.protocol == other.protocol) self.protocol == other.protocol)
#=================================================================
def _strip_count(string, total_read):
length = len(string)
return string.rstrip(), total_read + length
#================================================================= #=================================================================
class StatusAndHeadersParser(object): class StatusAndHeadersParser(object):
""" """
@ -68,29 +75,33 @@ class StatusAndHeadersParser(object):
support continuation headers starting with space or tab support continuation headers starting with space or tab
""" """
statusline = stream.readline().rstrip() # status line w newlines intact
full_statusline = stream.readline()
statusline, total_read = _strip_count(full_statusline, 0)
protocol_status = self.split_prefix(statusline, self.statuslist) protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status: if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}' msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline) msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, statusline) raise StatusAndHeadersParserException(msg, full_statusline)
headers = [] headers = []
line = stream.readline().rstrip() line, total_read = _strip_count(stream.readline(), total_read)
while line: while line:
name, value = line.split(':', 1) name, value = line.split(':', 1)
name = name.rstrip(' \t') name = name.rstrip(' \t')
value = value.lstrip() value = value.lstrip()
next_line = stream.readline().rstrip() next_line, total_read = _strip_count(stream.readline(),
total_read)
# append continuation lines, if any # append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')): while next_line and next_line.startswith((' ', '\t')):
value += next_line value += next_line
next_line = stream.readline().rstrip() next_line, total_read = _strip_count(stream.readline(),
total_read)
header = (name, value) header = (name, value)
headers.append(header) headers.append(header)
@ -98,7 +109,8 @@ class StatusAndHeadersParser(object):
return StatusAndHeaders(statusline=protocol_status[1].strip(), return StatusAndHeaders(statusline=protocol_status[1].strip(),
headers=headers, headers=headers,
protocol=protocol_status[0]) protocol=protocol_status[0],
total_len=total_read)
@staticmethod @staticmethod
def split_prefix(key, prefixs): def split_prefix(key, prefixs):

View File

@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import BlockLoader, LimitReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
@ -73,14 +73,14 @@ class ArcWarcRecordLoader:
if the_format == 'arc': if the_format == 'arc':
rec_type = 'response' rec_type = 'response'
empty = (rec_headers.get_header('length') == 0) length = int(rec_headers.get_header('length'))
elif the_format == 'warc': elif the_format == 'warc':
rec_type = rec_headers.get_header('WARC-Type') rec_type = rec_headers.get_header('WARC-Type')
empty = (rec_headers.get_header('Content-Length') == '0') length = int(rec_headers.get_header('Content-Length'))
# special case: empty w/arc record (hopefully a revisit) # special case: empty w/arc record (hopefully a revisit)
if empty: if length == 0:
status_headers = StatusAndHeaders('204 No Content', []) status_headers = StatusAndHeaders('204 No Content', [])
# special case: warc records that are not expected to have http headers # special case: warc records that are not expected to have http headers
@ -102,6 +102,13 @@ class ArcWarcRecordLoader:
#(statusline, http_headers) = self.parse_http_headers(stream) #(statusline, http_headers) = self.parse_http_headers(stream)
status_headers = self.http_parser.parse(stream) status_headers = self.http_parser.parse(stream)
# limit the stream to the remainder, if >0
# should always be valid, but just in case, still stream if
# content-length was not set
remains = length - status_headers.total_len
if remains > 0:
stream = LimitReader.wrap_stream(stream, remains)
return ArcWarcRecord((the_format, rec_type), return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers) rec_headers, stream, status_headers)
@ -137,9 +144,14 @@ class ARCHeadersParser:
def parse(self, stream, headerline=None): def parse(self, stream, headerline=None):
total_read = 0
# if headerline passed in, use that # if headerline passed in, use that
if not headerline: if not headerline:
headerline = stream.readline().rstrip() headerline = stream.readline()
total_read = len(headerline)
headerline = headerline.rstrip()
parts = headerline.split() parts = headerline.split()
@ -157,4 +169,5 @@ class ARCHeadersParser:
return StatusAndHeaders(statusline='', return StatusAndHeaders(statusline='',
headers=headers, headers=headers,
protocol='ARC/1.0') protocol='ARC/1.0',
total_len=total_read)

View File

@ -0,0 +1,3 @@
CDX N b a m s k r M S V g
com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz

View File

@ -93,9 +93,24 @@ class TestWb:
def test_replay_identity_1(self): def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
#resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css') # no wb header insertion
#self._assert_basic_html(resp) assert 'wb.js' not in resp.body
# original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_identity_2_arcgz(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com')
# no wb header insertion
assert 'wb.js' not in resp.body
# original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_identity_2_arc(self):
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com')
# no wb header insertion # no wb header insertion
assert 'wb.js' not in resp.body assert 'wb.js' not in resp.body