From 403167fbe009760d75a1946fe37b61b38c5796df Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 11 Apr 2022 14:51:11 -0700 Subject: [PATCH] User-Agent Detection Fix + New-Style rewriting on by default + Dependency Update (2.6.6) (#708) * js rewriting: default to modern js-proxy based rewriting, use legacy rewriting only if browsers are older than minimum, as suggested in #707 * user-agent detection: use ua_parser for user-agent detection instead of obsolete werkzeug.useragent, which also did not support browsers >=100 * tests: additional tests for rewriting with various user-agents, defaulting to new-style rewriting for unknown browsers * dockerfile: Update Dockerfile to use py3.8 * tests: skip s3 tests dependent on commoncrawl data (for now, need better s3 tests). * bump to 2.6.6, update CHANGES --- CHANGES.rst | 8 ++++ Dockerfile | 2 +- pywb/rewrite/default_rewriter.py | 49 +++++++++++++--------- pywb/rewrite/test/test_content_rewriter.py | 14 +++---- pywb/utils/test/test_loaders.py | 5 ++- pywb/version.py | 2 +- requirements.txt | 3 +- tests/test_integration.py | 42 +++++++++++++++---- 8 files changed, 85 insertions(+), 40 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3fee6c49..686734d7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,11 @@ +pywb 2.6.6 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* dependency: don't use obsolete werkzeug useragent package `#704 `_ +* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 `_ +* fix tests: disable broken s3 tests for now +* Dockerfile: use python 3.8 by default + pywb 2.6.5 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/Dockerfile b/Dockerfile index 31729d6e..a248fd4c 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG PYTHON=python:3.7.2 +ARG PYTHON=python:3.8 FROM $PYTHON diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index f05ecc70..a584774f 100644 --- a/pywb/rewrite/default_rewriter.py 
+++ b/pywb/rewrite/default_rewriter.py @@ -20,7 +20,7 @@ from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter from pywb import DEFAULT_RULES_FILE import copy -from werkzeug.useragents import UserAgent +from ua_parser import user_agent_parser # ============================================================================ @@ -34,7 +34,7 @@ class DefaultRewriter(BaseContentRewriter): 'css': CSSRewriter, - 'js': JSLocationOnlyRewriter, + 'js': JSWombatProxyRewriter, 'js-proxy': JSNoneRewriter, 'js-worker': JSWorkerRewriter, @@ -119,33 +119,44 @@ class RewriterWithJSProxy(DefaultRewriter): super(RewriterWithJSProxy, self).__init__(*args, **kwargs) def get_rewriter(self, rw_type, rwinfo=None): - if rw_type == 'js' and rwinfo: - # check if UA allows this - if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts): - return JSWombatProxyRewriter + if rw_type != 'js' or not rwinfo: + return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo) - # otherwise, return default rewriter - return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo) + # check if should use old non-proxy rewriter + if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts): + print("loc only") + return JSLocationOnlyRewriter + else: + # otherwise, return default, js proxy-capable rewriter + return JSWombatProxyRewriter - def ua_allows_obj_proxy(self, opts): + def ua_no_obj_proxy(self, opts): ua = opts.get('ua') if not ua: ua_string = opts.get('ua_string') if ua_string: - ua = UserAgent(ua_string) + ua = user_agent_parser.ParseUserAgent(ua_string) if ua is None: - return True + return False supported = { - 'chrome': '49.0', - 'firefox': '44.0', - 'safari': '10.0', - 'opera': '36.0', - 'edge': '12.0', - 'msie': None, + 'chrome': 49, + 'firefox': 4, + 'safari': 10, + 'opera': 36, + 'edge': 12, + 'ie': 1000, } - min_vers = supported.get(ua.browser) + min_vers = supported.get(ua.get("family", "").lower()) + if not min_vers: + return False + + try: + ua_version = 
int(ua.get("major", 0)) + except: + return False + + return ua_version < min_vers - return (min_vers and ua.version >= min_vers) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 174a222b..7f4298c1 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -13,7 +13,7 @@ from pywb.utils.io import chunk_encode_iter from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy +from pywb.rewrite.default_rewriter import RewriterWithJSProxy from pywb import get_test_dir @@ -39,8 +39,7 @@ def headers(request): class TestContentRewriter(object): @classmethod def setup_class(self): - self.content_rewriter = DefaultRewriter() - self.js_proxy_content_rewriter = RewriterWithJSProxy() + self.content_rewriter = RewriterWithJSProxy() def _create_response_record(self, url, headers, payload, warc_headers): writer = BufferWARCWriter() @@ -65,7 +64,6 @@ class TestContentRewriter(object): record = self._create_response_record(url, headers, content, warc_headers) wburl = WbUrl(ts + '/' + (request_url or url)) - url_rewriter = UrlRewriter(wburl, prefix) cdx = CDXObject() cdx['url'] = url @@ -79,11 +77,13 @@ class TestContentRewriter(object): return '' if use_js_proxy: - rewriter = self.js_proxy_content_rewriter + rewrite_opts = {} else: - rewriter = self.content_rewriter + rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'} - return rewriter(record, url_rewriter, cookie_rewriter=None, + url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts) + + return self.content_rewriter(record, url_rewriter, cookie_rewriter=None, head_insert_func=insert_func, cdx=cdx, environ=environ) diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 12dbc14b..0366a08d 
100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -97,7 +97,7 @@ from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' - +@pytest.mark.skip("skip for now, made need different s3 source") def test_s3_read_1(): pytest.importorskip('boto3') @@ -112,13 +112,14 @@ def test_s3_read_1(): assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC-Type: response\r\n' +@pytest.mark.skip("skip for now, made need different s3 source") def test_s3_read_2(): pytest.importorskip('boto3') res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html') buff = res.read() - assert len(buff) == 2082 + assert len(buff) == 2330 reader = DecompressingBufferedReader(BytesIO(buff)) assert reader.readline() == b'\n' diff --git a/pywb/version.py b/pywb/version.py index d7d04e0a..8c5301d7 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.6.5' +__version__ = '2.6.6' if __name__ == '__main__': print(__version__) diff --git a/requirements.txt b/requirements.txt index cde664bc..b4737574 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jinja2<3.0.0 surt>=0.3.1 brotlipy pyyaml -werkzeug==1.0.1 +werkzeug webencodings gevent==20.9.0 webassets==0.12.1 @@ -16,3 +16,4 @@ fakeredis<1.0 tldextract python-dateutil markupsafe<2.1.0 +ua_parser diff --git a/tests/test_integration.py b/tests/test_integration.py index f5abf5f0..1b0b33e4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -269,17 +269,41 @@ class TestWbIntegration(BaseConfigTest): assert resp.content_length != 0 assert resp.content_type == 'application/x-javascript' - # test with Chrome user agent - resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, - headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) - assert 'let window = 
_____WB$wombat$assign$function_____(' in resp.text + user_agents = [ + # chrome + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36' + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.3071.115 Safari/537.36' + # firefox + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0' + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/100.0', + # safari + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15' + # other + 'some-custom-browser' + ] - def test_replay_js_ie11_no_obj_proxy(self, fmod): - # IE11 user-agent, no proxy - resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, - headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}) + # test with each user-agent + for ua in user_agents: + resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + headers={'User-Agent': ua}) - assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text + assert 'let window = _____WB$wombat$assign$function_____(' in resp.text + + def test_replay_js_no_obj_proxy(self, fmod): + user_agents = [ + # IE11 user-agent, no proxy + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" + # old chrome + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/19.0.3071.115 Safari/537.36' + # old firefox + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/12.0' + ] + + for ua in user_agents: + resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + headers={'User-Agent': ua}) + + assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text def test_replay_non_exact(self, fmod): # non-exact 
mode, don't redirect to exact capture