1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Adaptive Streaming Improvements (#236)

* adaptive rewrite improvements:
- Add 'application/vnd.apple.mpegurl' as HLS type in rules.yaml and default_rewriter.py
- Support setting max resolution and max bandwidth to choose, defaults to 480x854 and 200000 respectively
- LiveWebLoader provides a get_custom_metadata for specifying WARC-JSON-Metadata header, per mime type (TODO: support customization via rules)
- When filtering, first limit by resolution (if set), then by bandwidth (if set); otherwise default to max bandwidth
- Max resolution/max bandwidth stored in WARC record under WARC-JSON-Metadata as 'adaptive_max_resolution' and 'adaptive_max_bandwidth' to ensure replayability. If absent, choose absolute max in manifest to be backwards compatible
- Add sample HLS and DASH manifests for testing, with and without max resolution/bandwidth settings.
This commit is contained in:
Ilya Kreymer 2017-09-06 23:23:39 -07:00 committed by GitHub
parent 5a0867fed9
commit 772993ba53
10 changed files with 361 additions and 24 deletions

View File

@ -8,6 +8,7 @@ from warcio.utils import to_native_str
import re
import webencodings
import tempfile
import json
from pywb.utils.io import StreamIter, BUFF_SIZE
@ -197,11 +198,27 @@ class BufferedRewriter(object):
stream_buffer.write(buff)
stream_buffer.seek(0)
return StreamIter(self.rewrite_stream(stream_buffer))
return StreamIter(self.rewrite_stream(stream_buffer, rwinfo))
def rewrite_stream(self, stream):
def rewrite_stream(self, stream, rwinfo):
    """Rewrite the buffered ``stream``; concrete rewriters must override this.

    :param stream: buffer holding the payload, positioned at its start
    :param rwinfo: rewrite context forwarded unchanged by the caller
    :raises NotImplementedError: always, in this base implementation
    """
    # ``NotImplemented`` is a comparison sentinel, not an exception class:
    # calling it raised an unrelated TypeError instead of signalling that
    # the subclass forgot to implement this hook.
    raise NotImplementedError('implement in subclass')
def _get_record_metadata(self, rwinfo):
client_metadata = rwinfo.record.rec_headers.get_header('WARC-JSON-Metadata')
if client_metadata:
try:
return json.loads(client_metadata)
except:
pass
return {}
def _get_adaptive_metadata(self, rwinfo):
metadata = self._get_record_metadata(rwinfo)
max_resolution = int(metadata.get('adaptive_max_resolution', 0))
max_bandwidth = int(metadata.get('adaptive_max_bandwidth', 1000000000))
return max_resolution, max_bandwidth
# ============================================================================
class StreamingRewriter(object):

View File

@ -63,6 +63,7 @@ class DefaultRewriter(BaseContentRewriter):
# HLS
'application/x-mpegURL': 'hls',
'application/vnd.apple.mpegurl': 'hls',
# DASH
'application/dash+xml': 'dash',

View File

@ -7,7 +7,7 @@ from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
# Experimental: not fully tested
class RewriteAMF(BufferedRewriter): #pragma: no cover
def rewrite_stream(self, stream):
def rewrite_stream(self, stream, rwinfo):
try:
from pyamf import remoting

View File

@ -1,5 +1,5 @@
from contextlib import closing
from io import BytesIO, StringIO
from io import BytesIO
import json
import xml.etree.ElementTree as ET
@ -9,11 +9,12 @@ from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
class RewriteDASH(BufferedRewriter):
def rewrite_stream(self, stream):
res_buff, best_ids = self.rewrite_dash(stream)
def rewrite_stream(self, stream, rwinfo):
res_buff, best_ids = self.rewrite_dash(stream, rwinfo)
return res_buff
def rewrite_dash(self, stream):
def rewrite_dash(self, stream, rwinfo):
max_resolution, max_bandwidth = self._get_adaptive_metadata(rwinfo)
ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'}
@ -26,24 +27,32 @@ class RewriteDASH(BufferedRewriter):
for period in root.findall('mpd:Period', namespaces):
for adaptset in period.findall('mpd:AdaptationSet', namespaces):
best = None
best_resolution = 0
best_bandwidth = 0
for repres in adaptset.findall('mpd:Representation', namespaces):
bandwidth = int(repres.get('bandwidth', '0'))
if not best or bandwidth > int(best.get('bandwidth', '0')):
curr_resolution = int(repres.get('width', '0')) * int(repres.get('height', '0'))
curr_bandwidth = int(repres.get('bandwidth', 0))
if curr_resolution and max_resolution:
if curr_resolution <= max_resolution and curr_resolution > best_resolution:
best_resolution = curr_resolution
best_bandwidth = curr_bandwidth
best = repres
elif curr_bandwidth <= max_bandwidth and curr_bandwidth > best_bandwidth:
best_resolution = curr_resolution
best_bandwidth = curr_bandwidth
best = repres
if best:
if best is not None:
best_ids.append(best.get('id'))
for repres in adaptset.findall('mpd:Representation', namespaces):
if repres != best:
adaptset.remove(repres)
string_io = StringIO()
tree.write(string_io, encoding='unicode', xml_declaration=True)
buff_io = BytesIO()
buff_io.write(string_io.getvalue().encode('utf-8'))
tree.write(buff_io, encoding='UTF-8', xml_declaration=True)
buff_io.seek(0)
return buff_io, best_ids

View File

@ -7,23 +7,43 @@ from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
class RewriteHLS(BufferedRewriter):
EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
EXT_RESOLUTION = re.compile('RESOLUTION=([\d]+)x([\d]+)')
def rewrite_stream(self, stream, rwinfo):
max_resolution, max_bandwidth = self._get_adaptive_metadata(rwinfo)
def rewrite_stream(self, stream):
buff = stream.read()
lines = buff.decode('utf-8').split('\n')
best = None
indexes = []
count = 0
best_index = None
best_bandwidth = 0
best_resolution = 0
for line in lines:
m = self.EXT_INF.match(line)
if m:
indexes.append(count)
bandwidth = int(m.group(1))
if not best or bandwidth > best:
best = bandwidth
curr_bandwidth = int(m.group(1))
# resolution
m2 = self.EXT_RESOLUTION.search(line)
if m2:
curr_resolution = int(m2.group(1)) * int(m2.group(2))
else:
curr_resolution = 0
if max_resolution and curr_resolution:
if curr_resolution > best_resolution and curr_resolution <= max_resolution:
best_resolution = curr_resolution
best_bandwidth = curr_bandwidth
best_index = count
elif curr_bandwidth > best_bandwidth and curr_bandwidth <= max_bandwidth:
best_resolution = curr_resolution
best_bandwidth = curr_bandwidth
best_index = count
count = count + 1

View File

@ -10,8 +10,12 @@ from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter
from pywb import get_test_dir
import os
import json
import pytest
@pytest.fixture(params=[{'Content-Type': 'text/html'},
{'Content-Type': 'application/xhtml+xml'},
{'Content-Type': 'application/octet-stream'},
@ -28,9 +32,11 @@ class TestContentRewriter(object):
def setup_class(self):
self.content_rewriter = DefaultRewriter()
def _create_response_record(self, url, headers, payload):
def _create_response_record(self, url, headers, payload, warc_headers):
writer = BufferWARCWriter()
warc_headers = warc_headers or {}
payload = payload.encode('utf-8')
http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
@ -38,12 +44,13 @@ class TestContentRewriter(object):
return writer.create_warc_record(url, 'response',
payload=BytesIO(payload),
length=len(payload),
http_headers=http_headers)
http_headers=http_headers,
warc_headers_dict=warc_headers)
def rewrite_record(self, headers, content, ts, url='http://example.com/',
prefix='http://localhost:8080/prefix/'):
prefix='http://localhost:8080/prefix/', warc_headers=None):
record = self._create_response_record(url, headers, content)
record = self._create_response_record(url, headers, content, warc_headers)
wburl = WbUrl(ts + '/' + url)
url_rewriter = UrlRewriter(wburl, prefix)
@ -217,5 +224,184 @@ class TestContentRewriter(object):
assert b''.join(gen).decode('utf-8') == content
def test_hls_default_max(self):
# HLS playlist rewritten without any WARC-JSON-Metadata header, so no
# explicit resolution/bandwidth limits are supplied. The expected output
# keeps only the highest-bandwidth variant of the sample playlist
# (BANDWIDTH=4495000, 1920x1080) together with the non-variant lines.
headers = {'Content-Type': 'application/vnd.apple.mpegurl'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
content = fh.read()
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/master.m3u8')
assert headers.headers == [('Content-Type', 'application/vnd.apple.mpegurl')]
filtered = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_6.m3u8
"""
assert b''.join(gen).decode('utf-8') == filtered
def test_hls_custom_max_resolution(self):
# Both limits are supplied through the WARC-JSON-Metadata header:
# resolution 921600 (= 1280 * 720 pixels) and bandwidth 2000000.
# The expected variant is the 1280x720 one even though its BANDWIDTH
# (2505000) is above the bandwidth cap: when a variant declares a
# resolution and a max resolution is set, only the resolution is compared.
headers = {'Content-Type': 'application/x-mpegURL'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
content = fh.read()
metadata = {'adaptive_max_resolution': 921600,
'adaptive_max_bandwidth': 2000000}
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/master.m3u8',
warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)})
assert headers.headers == [('Content-Type', 'application/x-mpegURL')]
filtered = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2505000,RESOLUTION=1280x720,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_5.m3u8
"""
assert b''.join(gen).decode('utf-8') == filtered
def test_hls_custom_max_bandwidth(self):
# Only a bandwidth cap (2000000) is supplied, with no resolution limit.
# The expected variant is the one with the highest BANDWIDTH that does
# not exceed the cap: 1002000 (640x360) in the sample playlist.
headers = {'Content-Type': 'application/x-mpegURL'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
content = fh.read()
metadata = {'adaptive_max_bandwidth': 2000000}
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/master.m3u8',
warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)})
assert headers.headers == [('Content-Type', 'application/x-mpegURL')]
filtered = """\
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_4.m3u8
"""
assert b''.join(gen).decode('utf-8') == filtered
def test_dash_default_max(self):
# DASH manifest rewritten without any WARC-JSON-Metadata header, so no
# explicit limits are supplied. Each AdaptationSet is expected to keep
# only its highest-bandwidth Representation: video id="1" (4190760) and
# audio id="7" (255236) in the sample manifest.
headers = {'Content-Type': 'application/dash+xml'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
content = fh.read()
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/manifest.mpd')
assert headers.headers == [('Content-Type', 'application/dash+xml')]
filtered = """\
<?xml version='1.0' encoding='UTF-8'?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static">
<Period duration="PT0H3M1.63S" start="PT0S">
<AdaptationSet>
<ContentComponent contentType="video" id="1" />
<Representation bandwidth="4190760" codecs="avc1.640028" height="1080" id="1" mimeType="video/mp4" width="1920">
<BaseURL>http://example.com/video-10.mp4</BaseURL>
<SegmentBase indexRange="674-1149">
<Initialization range="0-673" />
</SegmentBase>
</Representation>
</AdaptationSet>
<AdaptationSet>
<ContentComponent contentType="audio" id="2" />
<Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100">
<BaseURL>http://example.com/audio-2.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
</AdaptationSet>
</Period>
</MPD>"""
assert b''.join(gen).decode('utf-8') == filtered
def test_dash_custom_max_resolution(self):
# Both limits are supplied through the WARC-JSON-Metadata header:
# resolution 921600 (= 1280 * 720 pixels) and bandwidth 2000000.
# Video: the 1280x720 Representation (id="2") is expected even though
# its bandwidth (2073921) is above the cap, because representations
# that declare width/height are compared by resolution only.
# Audio: representations carry no width/height, so the bandwidth cap
# applies and the highest one under it (id="7", 255236) is expected.
headers = {'Content-Type': 'application/dash+xml'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
content = fh.read()
metadata = {'adaptive_max_resolution': 921600,
'adaptive_max_bandwidth': 2000000}
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/manifest.mpd',
warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)})
assert headers.headers == [('Content-Type', 'application/dash+xml')]
filtered = """\
<?xml version='1.0' encoding='UTF-8'?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static">
<Period duration="PT0H3M1.63S" start="PT0S">
<AdaptationSet>
<ContentComponent contentType="video" id="1" />
<Representation bandwidth="2073921" codecs="avc1.4d401f" height="720" id="2" mimeType="video/mp4" width="1280">
<BaseURL>http://example.com/video-9.mp4</BaseURL>
<SegmentBase indexRange="708-1183">
<Initialization range="0-707" />
</SegmentBase>
</Representation>
</AdaptationSet>
<AdaptationSet>
<ContentComponent contentType="audio" id="2" />
<Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100">
<BaseURL>http://example.com/audio-2.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
</AdaptationSet>
</Period>
</MPD>"""
assert b''.join(gen).decode('utf-8') == filtered
def test_dash_custom_max_bandwidth(self):
# Only a bandwidth cap (2000000) is supplied, with no resolution limit.
# Each AdaptationSet is expected to keep the Representation with the
# highest bandwidth not exceeding the cap: video id="3" (869460, since
# 2073921 is over the cap) and audio id="7" (255236).
headers = {'Content-Type': 'application/dash+xml'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:
content = fh.read()
metadata = {'adaptive_max_bandwidth': 2000000}
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_',
url='http://example.com/path/manifest.mpd',
warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)})
assert headers.headers == [('Content-Type', 'application/dash+xml')]
filtered = """\
<?xml version='1.0' encoding='UTF-8'?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static">
<Period duration="PT0H3M1.63S" start="PT0S">
<AdaptationSet>
<ContentComponent contentType="video" id="1" />
<Representation bandwidth="869460" codecs="avc1.4d401e" height="480" id="3" mimeType="video/mp4" width="854">
<BaseURL>http://example.com/video-8.mp4</BaseURL>
<SegmentBase indexRange="708-1183">
<Initialization range="0-707" />
</SegmentBase>
</Representation>
</AdaptationSet>
<AdaptationSet>
<ContentComponent contentType="audio" id="2" />
<Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100">
<BaseURL>http://example.com/audio-2.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
</AdaptationSet>
</Period>
</MPD>"""
assert b''.join(gen).decode('utf-8') == filtered

View File

@ -12,9 +12,16 @@ default_filters:
# ignore query args for the following mime types
mimes:
- 'application/dash+xml'
# flash
- 'application/x-shockwave-flash'
# dash
- 'application/dash+xml'
# hls
- 'application/x-mpegURL'
- 'application/vnd.apple.mpegurl'
# apply following url normalization rules
# on both match url and request url
# to find a match (not limited to query argument removal)

View File

@ -237,6 +237,10 @@ class LiveWebLoader(BaseLoader):
UNREWRITE_HEADERS = ('location', 'content-location')
VIDEO_MIMES = ('application/x-mpegURL',
'application/vnd.apple.mpegurl',
'application/dash+xml')
def __init__(self, forward_proxy_prefix=None, adapter=None):
self.forward_proxy_prefix = forward_proxy_prefix
@ -378,8 +382,15 @@ class LiveWebLoader(BaseLoader):
if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip
ct = upstream_res.headers.get('Content-Type')
if ct:
metadata = self.get_custom_metadata(ct, dt)
if metadata:
warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)
warc_headers['Content-Type'] = 'application/http; msgtype=response'
self._set_content_len(upstream_res.headers.get('Content-Length', -1),
warc_headers,
len(http_headers_buff))
@ -455,6 +466,11 @@ class LiveWebLoader(BaseLoader):
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
raise LiveResourceException(load_url)
def get_custom_metadata(self, content_type, dt):
    """Return extra metadata to record for a response of ``content_type``.

    For the adaptive-streaming manifest types listed in ``VIDEO_MIMES``
    this returns the resolution/bandwidth limits to store with the record;
    for every other type it returns ``None``.

    :param content_type: raw ``Content-Type`` header value of the response
    :param dt: unused by this implementation
    :return: dict with ``adaptive_max_resolution`` and
        ``adaptive_max_bandwidth``, or ``None``
    """
    # The header may carry parameters ("...; charset=utf-8") and media types
    # are case-insensitive, so compare only the normalized type/subtype
    # rather than the raw header string.
    mime = (content_type or '').split(';', 1)[0].strip().lower()

    if any(mime == video_mime.lower() for video_mime in self.VIDEO_MIMES):
        return {'adaptive_max_resolution': 1280 * 720,
                'adaptive_max_bandwidth': 2000000}

    return None
def __str__(self):
# Fixed, human-readable name for this loader.
return 'LiveWebLoader'

View File

@ -0,0 +1,65 @@
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" mediaPresentationDuration="PT0H3M1.63S" minBufferTime="PT1.5S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011"
type="static">
<Period duration="PT0H3M1.63S" start="PT0S">
<AdaptationSet>
<ContentComponent contentType="video" id="1" />
<Representation bandwidth="4190760" codecs="avc1.640028" height="1080" id="1" mimeType="video/mp4" width="1920">
<BaseURL>http://example.com/video-10.mp4</BaseURL>
<SegmentBase indexRange="674-1149">
<Initialization range="0-673" />
</SegmentBase>
</Representation>
<Representation bandwidth="2073921" codecs="avc1.4d401f" height="720" id="2" mimeType="video/mp4" width="1280">
<BaseURL>http://example.com/video-9.mp4</BaseURL>
<SegmentBase indexRange="708-1183">
<Initialization range="0-707" />
</SegmentBase>
</Representation>
<Representation bandwidth="869460" codecs="avc1.4d401e" height="480" id="3" mimeType="video/mp4" width="854">
<BaseURL>http://example.com/video-8.mp4</BaseURL>
<SegmentBase indexRange="708-1183">
<Initialization range="0-707" />
</SegmentBase>
</Representation>
<Representation bandwidth="686521" codecs="avc1.4d401e" height="360" id="4" mimeType="video/mp4" width="640">
<BaseURL>http://example.com/video-7.mp4</BaseURL>
<SegmentBase indexRange="708-1183">
<Initialization range="0-707" />
</SegmentBase>
</Representation>
<Representation bandwidth="264835" codecs="avc1.4d4015" height="240" id="5" mimeType="video/mp4" width="426">
<BaseURL>http://example.com/video-6.mp4</BaseURL>
<SegmentBase indexRange="672-1147">
<Initialization range="0-671" />
</SegmentBase>
</Representation>
<Representation bandwidth="100000" codecs="avc1.4d4015" height="144" id="5" mimeType="video/mp4" width="256">
<BaseURL>http://example.com/video-5.mp4</BaseURL>
<SegmentBase indexRange="671-1146">
<Initialization range="0-670" />
</SegmentBase>
</Representation>
</AdaptationSet>
<AdaptationSet>
<ContentComponent contentType="audio" id="2" />
<Representation bandwidth="127236" codecs="mp4a.40.2" id="6" mimeType="audio/mp4" numChannels="2" sampleRate="44100">
<BaseURL>http://example.com/audio-1.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
<Representation bandwidth="255236" codecs="mp4a.40.2" id="7" mimeType="audio/mp4" numChannels="2" sampleRate="44100">
<BaseURL>http://example.com/audio-2.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
<Representation bandwidth="31749" codecs="mp4a.40.5" id="8" mimeType="audio/mp4" numChannels="1" sampleRate="22050">
<BaseURL>http://example.com/audio-0.mp4</BaseURL>
<SegmentBase indexRange="592-851">
<Initialization range="0-591" />
</SegmentBase>
</Representation>
</AdaptationSet>
</Period>
</MPD>

View File

@ -0,0 +1,16 @@
#EXTM3U
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=610000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_1.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=416000,RESOLUTION=400x224,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_2.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=797000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_3.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_4.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2505000,RESOLUTION=1280x720,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_5.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/video_6.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=38000,CODECS="mp4a.40.2",SUBTITLES="WebVTT"
http://example.com/audio_0.m3u8