From 772993ba53f0bd0045f92541a7edb475c8d4c643 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Sep 2017 23:23:39 -0700 Subject: [PATCH] Adaptive Streaming Improvements (#236) * adaptive rewrite improvements: - Add 'application/vnd.apple.mpegurl' as HLS type in rules.yaml and default_rewriter.py - Support setting max resolution and max bandwidth to choose, defaults to 480x854 and 200000 respectively - LiveWebLoader provides a get_custom_metadata for specifying WARC-JSON-Metadata header, per mime type (TODO: support customization via rules) - When filtering, first limit by resolution (if set), then by bandwidth (if set), otherwise default to max bandwidth - Max resolution/max bandwidth are stored in WARC record under WARC-JSON-Metadata as 'adaptive_max_resolution' and 'adaptive_max_bandwidth' to ensure replayability. If absent, choose absolute max in manifest to be backwards compatible - Add sample HLS and DASH manifests for testing, with and without max resolution/bandwidth settings. 
--- pywb/rewrite/content_rewriter.py | 21 ++- pywb/rewrite/default_rewriter.py | 1 + pywb/rewrite/rewrite_amf.py | 2 +- pywb/rewrite/rewrite_dash.py | 31 ++-- pywb/rewrite/rewrite_hls.py | 30 ++- pywb/rewrite/test/test_content_rewriter.py | 194 +++++++++++++++++++- pywb/rules.yaml | 9 +- pywb/warcserver/resource/responseloader.py | 16 ++ sample_archive/text_content/sample_dash.mpd | 65 +++++++ sample_archive/text_content/sample_hls.m3u8 | 16 ++ 10 files changed, 361 insertions(+), 24 deletions(-) create mode 100644 sample_archive/text_content/sample_dash.mpd create mode 100644 sample_archive/text_content/sample_hls.m3u8 diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index a24bf1cd..c067c587 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -8,6 +8,7 @@ from warcio.utils import to_native_str import re import webencodings import tempfile +import json from pywb.utils.io import StreamIter, BUFF_SIZE @@ -197,11 +198,27 @@ class BufferedRewriter(object): stream_buffer.write(buff) stream_buffer.seek(0) - return StreamIter(self.rewrite_stream(stream_buffer)) + return StreamIter(self.rewrite_stream(stream_buffer, rwinfo)) - def rewrite_stream(self, stream): + def rewrite_stream(self, stream, rwinfo): raise NotImplemented('implement in subclass') + def _get_record_metadata(self, rwinfo): + client_metadata = rwinfo.record.rec_headers.get_header('WARC-JSON-Metadata') + if client_metadata: + try: + return json.loads(client_metadata) + except: + pass + + return {} + + def _get_adaptive_metadata(self, rwinfo): + metadata = self._get_record_metadata(rwinfo) + max_resolution = int(metadata.get('adaptive_max_resolution', 0)) + max_bandwidth = int(metadata.get('adaptive_max_bandwidth', 1000000000)) + return max_resolution, max_bandwidth + # ============================================================================ class StreamingRewriter(object): diff --git a/pywb/rewrite/default_rewriter.py 
b/pywb/rewrite/default_rewriter.py index f96541ff..07eee084 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -63,6 +63,7 @@ class DefaultRewriter(BaseContentRewriter): # HLS 'application/x-mpegURL': 'hls', + 'application/vnd.apple.mpegurl': 'hls', # DASH 'application/dash+xml': 'dash', diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py index aba45edf..8a471604 100644 --- a/pywb/rewrite/rewrite_amf.py +++ b/pywb/rewrite/rewrite_amf.py @@ -7,7 +7,7 @@ from pywb.rewrite.content_rewriter import BufferedRewriter # ============================================================================ # Experimental: not fully tested class RewriteAMF(BufferedRewriter): #pragma: no cover - def rewrite_stream(self, stream): + def rewrite_stream(self, stream, rwinfo): try: from pyamf import remoting diff --git a/pywb/rewrite/rewrite_dash.py b/pywb/rewrite/rewrite_dash.py index ab3dd5ea..a5a4e93f 100644 --- a/pywb/rewrite/rewrite_dash.py +++ b/pywb/rewrite/rewrite_dash.py @@ -1,5 +1,5 @@ from contextlib import closing -from io import BytesIO, StringIO +from io import BytesIO import json import xml.etree.ElementTree as ET @@ -9,11 +9,12 @@ from pywb.rewrite.content_rewriter import BufferedRewriter # ============================================================================ class RewriteDASH(BufferedRewriter): - def rewrite_stream(self, stream): - res_buff, best_ids = self.rewrite_dash(stream) + def rewrite_stream(self, stream, rwinfo): + res_buff, best_ids = self.rewrite_dash(stream, rwinfo) return res_buff - def rewrite_dash(self, stream): + def rewrite_dash(self, stream, rwinfo): + max_resolution, max_bandwidth = self._get_adaptive_metadata(rwinfo) ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011') namespaces = {'mpd': 'urn:mpeg:dash:schema:mpd:2011'} @@ -26,24 +27,32 @@ class RewriteDASH(BufferedRewriter): for period in root.findall('mpd:Period', namespaces): for adaptset in period.findall('mpd:AdaptationSet', 
namespaces): - best = None + best_resolution = 0 + best_bandwidth = 0 + for repres in adaptset.findall('mpd:Representation', namespaces): - bandwidth = int(repres.get('bandwidth', '0')) - if not best or bandwidth > int(best.get('bandwidth', '0')): + curr_resolution = int(repres.get('width', '0')) * int(repres.get('height', '0')) + curr_bandwidth = int(repres.get('bandwidth', 0)) + if curr_resolution and max_resolution: + if curr_resolution <= max_resolution and curr_resolution > best_resolution: + best_resolution = curr_resolution + best_bandwidth = curr_bandwidth + best = repres + elif curr_bandwidth <= max_bandwidth and curr_bandwidth > best_bandwidth: + best_resolution = curr_resolution + best_bandwidth = curr_bandwidth best = repres - if best: + if best is not None: best_ids.append(best.get('id')) for repres in adaptset.findall('mpd:Representation', namespaces): if repres != best: adaptset.remove(repres) - string_io = StringIO() - tree.write(string_io, encoding='unicode', xml_declaration=True) buff_io = BytesIO() - buff_io.write(string_io.getvalue().encode('utf-8')) + tree.write(buff_io, encoding='UTF-8', xml_declaration=True) buff_io.seek(0) return buff_io, best_ids diff --git a/pywb/rewrite/rewrite_hls.py b/pywb/rewrite/rewrite_hls.py index 06be6750..618d6a38 100644 --- a/pywb/rewrite/rewrite_hls.py +++ b/pywb/rewrite/rewrite_hls.py @@ -7,23 +7,43 @@ from pywb.rewrite.content_rewriter import BufferedRewriter # ============================================================================ class RewriteHLS(BufferedRewriter): EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)') + EXT_RESOLUTION = re.compile('RESOLUTION=([\d]+)x([\d]+)') + + def rewrite_stream(self, stream, rwinfo): + max_resolution, max_bandwidth = self._get_adaptive_metadata(rwinfo) - def rewrite_stream(self, stream): buff = stream.read() lines = buff.decode('utf-8').split('\n') - best = None indexes = [] count = 0 best_index = None + best_bandwidth = 0 + best_resolution = 0 + 
for line in lines: m = self.EXT_INF.match(line) if m: indexes.append(count) - bandwidth = int(m.group(1)) - if not best or bandwidth > best: - best = bandwidth + curr_bandwidth = int(m.group(1)) + + # resolution + m2 = self.EXT_RESOLUTION.search(line) + if m2: + curr_resolution = int(m2.group(1)) * int(m2.group(2)) + else: + curr_resolution = 0 + + if max_resolution and curr_resolution: + if curr_resolution > best_resolution and curr_resolution <= max_resolution: + best_resolution = curr_resolution + best_bandwidth = curr_bandwidth + best_index = count + + elif curr_bandwidth > best_bandwidth and curr_bandwidth <= max_bandwidth: + best_resolution = curr_resolution + best_bandwidth = curr_bandwidth best_index = count count = count + 1 diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index c5e0fb37..981a0018 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -10,8 +10,12 @@ from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.default_rewriter import DefaultRewriter +from pywb import get_test_dir +import os +import json import pytest + @pytest.fixture(params=[{'Content-Type': 'text/html'}, {'Content-Type': 'application/xhtml+xml'}, {'Content-Type': 'application/octet-stream'}, @@ -28,9 +32,11 @@ class TestContentRewriter(object): def setup_class(self): self.content_rewriter = DefaultRewriter() - def _create_response_record(self, url, headers, payload): + def _create_response_record(self, url, headers, payload, warc_headers): writer = BufferWARCWriter() + warc_headers = warc_headers or {} + payload = payload.encode('utf-8') http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0') @@ -38,12 +44,13 @@ class TestContentRewriter(object): return writer.create_warc_record(url, 'response', payload=BytesIO(payload), length=len(payload), - http_headers=http_headers) + http_headers=http_headers, + 
warc_headers_dict=warc_headers) def rewrite_record(self, headers, content, ts, url='http://example.com/', - prefix='http://localhost:8080/prefix/'): + prefix='http://localhost:8080/prefix/', warc_headers=None): - record = self._create_response_record(url, headers, content) + record = self._create_response_record(url, headers, content, warc_headers) wburl = WbUrl(ts + '/' + url) url_rewriter = UrlRewriter(wburl, prefix) @@ -217,5 +224,184 @@ class TestContentRewriter(object): assert b''.join(gen).decode('utf-8') == content + def test_hls_default_max(self): + headers = {'Content-Type': 'application/vnd.apple.mpegurl'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: + content = fh.read() + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/master.m3u8') + + assert headers.headers == [('Content-Type', 'application/vnd.apple.mpegurl')] + filtered = """\ +#EXTM3U +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/" +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_6.m3u8 +""" + + assert b''.join(gen).decode('utf-8') == filtered + + def test_hls_custom_max_resolution(self): + headers = {'Content-Type': 'application/x-mpegURL'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: + content = fh.read() + + metadata = {'adaptive_max_resolution': 921600, + 'adaptive_max_bandwidth': 2000000} + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/master.m3u8', + warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) + + assert headers.headers == [('Content-Type', 'application/x-mpegURL')] + filtered = """\ +#EXTM3U 
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/" +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2505000,RESOLUTION=1280x720,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_5.m3u8 +""" + + assert b''.join(gen).decode('utf-8') == filtered + + def test_hls_custom_max_bandwidth(self): + headers = {'Content-Type': 'application/x-mpegURL'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: + content = fh.read() + + metadata = {'adaptive_max_bandwidth': 2000000} + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/master.m3u8', + warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) + + assert headers.headers == [('Content-Type', 'application/x-mpegURL')] + filtered = """\ +#EXTM3U +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/" +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_4.m3u8 +""" + + assert b''.join(gen).decode('utf-8') == filtered + + def test_dash_default_max(self): + headers = {'Content-Type': 'application/dash+xml'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: + content = fh.read() + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/manifest.mpd') + + assert headers.headers == [('Content-Type', 'application/dash+xml')] + + filtered = """\ + + + + + + + http://example.com/video-10.mp4 + + + + + + + + + http://example.com/audio-2.mp4 + + + + + + +""" + assert b''.join(gen).decode('utf-8') == filtered + + def test_dash_custom_max_resolution(self): + headers = {'Content-Type': 'application/dash+xml'} + with open(os.path.join(get_test_dir(), 'text_content', 
'sample_dash.mpd'), 'rt') as fh: + content = fh.read() + + metadata = {'adaptive_max_resolution': 921600, + 'adaptive_max_bandwidth': 2000000} + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/manifest.mpd', + warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) + + assert headers.headers == [('Content-Type', 'application/dash+xml')] + + filtered = """\ + + + + + + + http://example.com/video-9.mp4 + + + + + + + + + http://example.com/audio-2.mp4 + + + + + + +""" + + assert b''.join(gen).decode('utf-8') == filtered + + + def test_dash_custom_max_bandwidth(self): + headers = {'Content-Type': 'application/dash+xml'} + with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh: + content = fh.read() + + metadata = {'adaptive_max_bandwidth': 2000000} + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701oe_', + url='http://example.com/path/manifest.mpd', + warc_headers={'WARC-JSON-Metadata': json.dumps(metadata)}) + + assert headers.headers == [('Content-Type', 'application/dash+xml')] + + filtered = """\ + + + + + + + http://example.com/video-8.mp4 + + + + + + + + + http://example.com/audio-2.mp4 + + + + + + +""" + + assert b''.join(gen).decode('utf-8') == filtered + diff --git a/pywb/rules.yaml b/pywb/rules.yaml index a5be4dea..25d7139b 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -12,9 +12,16 @@ default_filters: # ignore query args for the following mime types mimes: - - 'application/dash+xml' + # flash - 'application/x-shockwave-flash' + # dash + - 'application/dash+xml' + + # hls + - 'application/x-mpegURL' + - 'application/vnd.apple.mpegurl' + # apply following url normalization rules # on both match url and request url # to find a match (not limited to query argument removal) diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 63725c0e..f2b49186 100644 --- 
a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -237,6 +237,10 @@ class LiveWebLoader(BaseLoader): UNREWRITE_HEADERS = ('location', 'content-location') + VIDEO_MIMES = ('application/x-mpegURL', + 'application/vnd.apple.mpegurl', + 'application/dash+xml') + def __init__(self, forward_proxy_prefix=None, adapter=None): self.forward_proxy_prefix = forward_proxy_prefix @@ -378,8 +382,15 @@ class LiveWebLoader(BaseLoader): if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip + ct = upstream_res.headers.get('Content-Type') + if ct: + metadata = self.get_custom_metadata(ct, dt) + if metadata: + warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata) + warc_headers['Content-Type'] = 'application/http; msgtype=response' + self._set_content_len(upstream_res.headers.get('Content-Length', -1), warc_headers, len(http_headers_buff)) @@ -455,6 +466,11 @@ class LiveWebLoader(BaseLoader): logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e)) raise LiveResourceException(load_url) + def get_custom_metadata(self, content_type, dt): + if content_type in self.VIDEO_MIMES: + return {'adaptive_max_resolution': 1280 * 720, + 'adaptive_max_bandwidth': 2000000} + def __str__(self): return 'LiveWebLoader' diff --git a/sample_archive/text_content/sample_dash.mpd b/sample_archive/text_content/sample_dash.mpd new file mode 100644 index 00000000..7f83670f --- /dev/null +++ b/sample_archive/text_content/sample_dash.mpd @@ -0,0 +1,65 @@ + + + + + + http://example.com/video-10.mp4 + + + + + + http://example.com/video-9.mp4 + + + + + + http://example.com/video-8.mp4 + + + + + + http://example.com/video-7.mp4 + + + + + + http://example.com/video-6.mp4 + + + + + + http://example.com/video-5.mp4 + + + + + + + + + http://example.com/audio-1.mp4 + + + + + + http://example.com/audio-2.mp4 + + + + + + http://example.com/audio-0.mp4 + + + + + + + diff --git a/sample_archive/text_content/sample_hls.m3u8 
b/sample_archive/text_content/sample_hls.m3u8 new file mode 100644 index 00000000..8b9c21ce --- /dev/null +++ b/sample_archive/text_content/sample_hls.m3u8 @@ -0,0 +1,16 @@ +#EXTM3U +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/" +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=610000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_1.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=416000,RESOLUTION=400x224,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_2.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=797000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_3.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_4.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2505000,RESOLUTION=1280x720,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_5.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/video_6.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=38000,CODECS="mp4a.40.2",SUBTITLES="WebVTT" +http://example.com/audio_0.m3u8