From db3ba5a067dc705cba021759a81a9cee1f3f8412 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 2 Nov 2017 19:43:48 -0700 Subject: [PATCH] Rules Work (vimeo) and live_only flag (#264) * rules work: - apply 'js_regexs' on json content also, using 'js-proxy' rewriter - rules for vimeo, disable hls/dash - add 'live_only' flag 'rewrite' to enable rewrite only when 'is_live' is set - tests: add test for new vimeo rules, testing live_only cli: add '--record' cli option to enable quick-recording from live collection --- pywb/apps/cli.py | 4 ++++ pywb/rewrite/content_rewriter.py | 14 ++++++++++++- pywb/rewrite/test/test_content_rewriter.py | 23 +++++++++++++++++++++- pywb/rules.yaml | 17 ++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index ae5f786f..77c6f9f9 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -35,6 +35,7 @@ class BaseCli(object): parser.add_argument('--profile', action='store_true') parser.add_argument('--live', action='store_true', help='Add live-web handler at /live') + parser.add_argument('--record', action='store_true') parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection') parser.add_argument('--proxy-record', action='store_true', help='Enable Proxy Recording into specified collection') @@ -71,6 +72,9 @@ class BaseCli(object): if self.r.debug: self.extra_config['debug'] = True + if self.r.record: + self.extra_config['recorder'] = 'live' + def run(self): self.run_gevent() return self diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 3906026e..295e11e4 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -70,7 +70,19 @@ class BaseContentRewriter(object): return {} + def has_custom_rules(self, rule, cdx): + if 'js_regex_func' not in rule: + return False + + if rule.get('live_only') and not cdx.get('is_live'): + return False + + return True + def get_rw_class(self, rule, text_type, rwinfo): + if text_type == 'json' and 'js_regex_func' in rule: + text_type = 'js-proxy' + if text_type == 'js' and not rwinfo.is_url_rw(): text_type = 'js-proxy' @@ -89,7 +101,7 @@ class BaseContentRewriter(object): if rw_type in ('js', 'js-proxy'): extra_rules = [] - if 'js_regex_func' in rule: + if self.has_custom_rules(rule, cdx): extra_rules = rule['js_regex_func'](rwinfo.url_rewriter) # if js-proxy and no rules, default to none diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index ba83e166..19b4dbf7 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -13,11 +13,13 @@ from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.default_rewriter import DefaultRewriter from pywb import get_test_dir + import os import json import pytest +# ============================================================================ @pytest.fixture(params=[{'Content-Type': 'text/html'}, {'Content-Type': 'application/xhtml+xml'}, {'Content-Type': 'application/octet-stream'}, @@ -51,7 +53,7 @@ class TestContentRewriter(object): def rewrite_record(self, headers, content, ts, url='http://example.com/', prefix='http://localhost:8080/prefix/', warc_headers=None, - request_url=None): + request_url=None, is_live=None): record = self._create_response_record(url, headers, content, warc_headers) @@ -64,6 +66,7 @@ class TestContentRewriter(object): cdx['urlkey'] = canonicalize(url) if request_url != url: cdx['is_fuzzy'] = '1' + cdx['is_live'] = is_live return self.content_rewriter(record, url_rewriter, None, cdx=cdx) @@ -272,6 +275,24 @@ class TestContentRewriter(object): assert b''.join(gen).decode('utf-8') == '{"ssid":"5678"}' + def test_custom_live_only(self): + headers = {'Content-Type': 'application/json'} + content = '{"foo":"bar", "dash": {"on": "true"}, "some": ["list"]' + + # is_live + rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', + url='https://player.vimeo.com/video/123445/config/config?A=B', + is_live='1') + + # rewritten + assert b''.join(gen).decode('utf-8') == '{"foo":"bar", "__dash": {"on": "true"}, "some": ["list"]' + + # not is_live + rw_headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', + url='https://player.vimeo.com/video/123445/config/config?A=B') + + assert b''.join(gen).decode('utf-8') == content + def test_hls_default_max(self): headers = {'Content-Type': 'application/vnd.apple.mpegurl'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 8697974a..545cb7fe 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -261,6 +261,21 @@ rules: # only use non query part of url, ignore query fuzzy_lookup: '()' + - url_prefix: 'com,vimeo,player)/video/' + + fuzzy_lookup: + match: + regex: 'com,vimeo.player\)/video/[\d]+/config?.*' + + rewrite: + live_only: true + js_regexs: + - match: '"dash":' + replace: '"__dash":' + + - match: '"hls":' + replace: '"__hls":' + - url_prefix: 'com,vimeocdn,' fuzzy_lookup: '()' @@ -279,7 +294,9 @@ rules: - videoFileId - signature + # vine + #================================================================= - url_prefix: 'co,vine,cdn,' fuzzy_lookup: