mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Limit stream by WARC/ARC record length instead of
HTTP Content-Length. Also track the length of StatusAndHeaders. Add tests to verify that the content length is correct for identity ARC and ARC.GZ replays as well.
This commit is contained in:
parent
53590537e0
commit
79da12348f
@ -4,7 +4,6 @@ from io import BytesIO
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.loaders import LimitReader
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
@ -110,12 +109,6 @@ class ReplayView:
|
||||
|
||||
response = None
|
||||
|
||||
# if Content-Length for payload is present,
|
||||
# ensure we don't read past it
|
||||
content_length = status_headers.get_header('content-length')
|
||||
if content_length:
|
||||
stream = LimitReader.wrap_stream(stream, content_length)
|
||||
|
||||
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
|
||||
|
||||
response = self.rewrite_content(wbrequest,
|
||||
|
@ -13,10 +13,11 @@ class StatusAndHeaders(object):
|
||||
Headers is a list of (name, value) tuples
|
||||
An optional protocol which appears on first line may be specified
|
||||
"""
|
||||
def __init__(self, statusline, headers, protocol=''):
|
||||
def __init__(self, statusline, headers, protocol='', total_len=0):
|
||||
self.statusline = statusline
|
||||
self.headers = headers
|
||||
self.protocol = protocol
|
||||
self.total_len = total_len
|
||||
|
||||
def get_header(self, name):
|
||||
"""
|
||||
@ -52,6 +53,12 @@ headers = {2})".format(self.protocol, self.statusline, headers_str)
|
||||
self.protocol == other.protocol)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def _strip_count(string, total_read):
|
||||
length = len(string)
|
||||
return string.rstrip(), total_read + length
|
||||
|
||||
|
||||
#=================================================================
|
||||
class StatusAndHeadersParser(object):
|
||||
"""
|
||||
@ -68,29 +75,33 @@ class StatusAndHeadersParser(object):
|
||||
|
||||
support continuation headers starting with space or tab
|
||||
"""
|
||||
statusline = stream.readline().rstrip()
|
||||
# status line w newlines intact
|
||||
full_statusline = stream.readline()
|
||||
statusline, total_read = _strip_count(full_statusline, 0)
|
||||
|
||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
||||
|
||||
if not protocol_status:
|
||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
||||
msg = msg.format(self.statuslist, statusline)
|
||||
raise StatusAndHeadersParserException(msg, statusline)
|
||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
||||
|
||||
headers = []
|
||||
|
||||
line = stream.readline().rstrip()
|
||||
line, total_read = _strip_count(stream.readline(), total_read)
|
||||
while line:
|
||||
name, value = line.split(':', 1)
|
||||
name = name.rstrip(' \t')
|
||||
value = value.lstrip()
|
||||
|
||||
next_line = stream.readline().rstrip()
|
||||
next_line, total_read = _strip_count(stream.readline(),
|
||||
total_read)
|
||||
|
||||
# append continuation lines, if any
|
||||
while next_line and next_line.startswith((' ', '\t')):
|
||||
value += next_line
|
||||
next_line = stream.readline().rstrip()
|
||||
next_line, total_read = _strip_count(stream.readline(),
|
||||
total_read)
|
||||
|
||||
header = (name, value)
|
||||
headers.append(header)
|
||||
@ -98,7 +109,8 @@ class StatusAndHeadersParser(object):
|
||||
|
||||
return StatusAndHeaders(statusline=protocol_status[1].strip(),
|
||||
headers=headers,
|
||||
protocol=protocol_status[0])
|
||||
protocol=protocol_status[0],
|
||||
total_len=total_read)
|
||||
|
||||
@staticmethod
|
||||
def split_prefix(key, prefixs):
|
||||
|
@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.loaders import BlockLoader, LimitReader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
@ -73,14 +73,14 @@ class ArcWarcRecordLoader:
|
||||
|
||||
if the_format == 'arc':
|
||||
rec_type = 'response'
|
||||
empty = (rec_headers.get_header('length') == 0)
|
||||
length = int(rec_headers.get_header('length'))
|
||||
|
||||
elif the_format == 'warc':
|
||||
rec_type = rec_headers.get_header('WARC-Type')
|
||||
empty = (rec_headers.get_header('Content-Length') == '0')
|
||||
length = int(rec_headers.get_header('Content-Length'))
|
||||
|
||||
# special case: empty w/arc record (hopefully a revisit)
|
||||
if empty:
|
||||
if length == 0:
|
||||
status_headers = StatusAndHeaders('204 No Content', [])
|
||||
|
||||
# special case: warc records that are not expected to have http headers
|
||||
@ -102,6 +102,13 @@ class ArcWarcRecordLoader:
|
||||
#(statusline, http_headers) = self.parse_http_headers(stream)
|
||||
status_headers = self.http_parser.parse(stream)
|
||||
|
||||
# limit the stream to the remainder, if >0
|
||||
# should always be valid, but just in case, still stream if
|
||||
# content-length was not set
|
||||
remains = length - status_headers.total_len
|
||||
if remains > 0:
|
||||
stream = LimitReader.wrap_stream(stream, remains)
|
||||
|
||||
return ArcWarcRecord((the_format, rec_type),
|
||||
rec_headers, stream, status_headers)
|
||||
|
||||
@ -137,9 +144,14 @@ class ARCHeadersParser:
|
||||
|
||||
def parse(self, stream, headerline=None):
|
||||
|
||||
total_read = 0
|
||||
|
||||
# if headerline passed in, use that
|
||||
if not headerline:
|
||||
headerline = stream.readline().rstrip()
|
||||
headerline = stream.readline()
|
||||
|
||||
total_read = len(headerline)
|
||||
headerline = headerline.rstrip()
|
||||
|
||||
parts = headerline.split()
|
||||
|
||||
@ -157,4 +169,5 @@ class ARCHeadersParser:
|
||||
|
||||
return StatusAndHeaders(statusline='',
|
||||
headers=headers,
|
||||
protocol='ARC/1.0')
|
||||
protocol='ARC/1.0',
|
||||
total_len=total_read)
|
||||
|
3
sample_archive/cdx/example-arc-test.cdx
Normal file
3
sample_archive/cdx/example-arc-test.cdx
Normal file
@ -0,0 +1,3 @@
|
||||
CDX N b a m s k r M S V g
|
||||
com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc
|
||||
com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz
|
@ -93,9 +93,24 @@ class TestWb:
|
||||
|
||||
def test_replay_identity_1(self):
|
||||
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
|
||||
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
|
||||
#resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
|
||||
#self._assert_basic_html(resp)
|
||||
|
||||
# no wb header insertion
|
||||
assert 'wb.js' not in resp.body
|
||||
|
||||
# original unrewritten url present
|
||||
assert '"http://www.iana.org/domains/example"' in resp.body
|
||||
|
||||
def test_replay_identity_2_arcgz(self):
|
||||
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com')
|
||||
|
||||
# no wb header insertion
|
||||
assert 'wb.js' not in resp.body
|
||||
|
||||
# original unrewritten url present
|
||||
assert '"http://www.iana.org/domains/example"' in resp.body
|
||||
|
||||
def test_replay_identity_2_arc(self):
|
||||
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com')
|
||||
|
||||
# no wb header insertion
|
||||
assert 'wb.js' not in resp.body
|
||||
|
Loading…
x
Reference in New Issue
Block a user