1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00

User-Agent Detection Fix + New-Style rewriting on by default + Dependency Update (2.6.6) (#708)

* js rewriting: use modern js-proxy based rewriting by default, falling back to legacy rewriting only if the browser is older than the minimum supported version, as suggested in #707
* user-agent detection: use ua_parser for user-agent detection instead of the obsolete werkzeug.useragents module, which also did not support browsers >=100
* tests: additional tests for rewriting with various user-agents, defaulting to new-style rewriting for unknown browsers
* dockerfile: Update Dockerfile to use py3.8
* tests: skip s3 tests dependent on commoncrawl data (for now, need better s3 tests).
* bump to 2.6.6, update CHANGES
This commit is contained in:
Ilya Kreymer 2022-04-11 14:51:11 -07:00 committed by GitHub
parent 63ac82ee6f
commit 403167fbe0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 85 additions and 40 deletions

View File

@ -1,3 +1,11 @@
pywb 2.6.6 changelist
~~~~~~~~~~~~~~~~~~~~~
* dependency: don't use obsolete werkzeug useragent package `#704 <https://github.com/webrecorder/pywb/pull/704>`_
* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 <https://github.com/webrecorder/pywb/pull/707>`_
* fix tests: disable broken s3 tests for now
* Dockerfile: use python 3.8 by default
pywb 2.6.5 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,4 +1,4 @@
ARG PYTHON=python:3.7.2
ARG PYTHON=python:3.8
FROM $PYTHON

View File

@ -20,7 +20,7 @@ from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
from pywb import DEFAULT_RULES_FILE
import copy
from werkzeug.useragents import UserAgent
from ua_parser import user_agent_parser
# ============================================================================
@ -34,7 +34,7 @@ class DefaultRewriter(BaseContentRewriter):
'css': CSSRewriter,
'js': JSLocationOnlyRewriter,
'js': JSWombatProxyRewriter,
'js-proxy': JSNoneRewriter,
'js-worker': JSWorkerRewriter,
@ -119,33 +119,44 @@ class RewriterWithJSProxy(DefaultRewriter):
super(RewriterWithJSProxy, self).__init__(*args, **kwargs)
def get_rewriter(self, rw_type, rwinfo=None):
if rw_type == 'js' and rwinfo:
# check if UA allows this
if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
return JSWombatProxyRewriter
# otherwise, return default rewriter
if rw_type != 'js' or not rwinfo:
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
def ua_allows_obj_proxy(self, opts):
# check if should use old non-proxy rewriter
if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
print("loc only")
return JSLocationOnlyRewriter
else:
# otherwise, return default, js proxy-capable rewriter
return JSWombatProxyRewriter
def ua_no_obj_proxy(self, opts):
ua = opts.get('ua')
if not ua:
ua_string = opts.get('ua_string')
if ua_string:
ua = UserAgent(ua_string)
ua = user_agent_parser.ParseUserAgent(ua_string)
if ua is None:
return True
return False
supported = {
'chrome': '49.0',
'firefox': '44.0',
'safari': '10.0',
'opera': '36.0',
'edge': '12.0',
'msie': None,
'chrome': 49,
'firefox': 4,
'safari': 10,
'opera': 36,
'edge': 12,
'ie': 1000,
}
min_vers = supported.get(ua.browser)
min_vers = supported.get(ua.get("family", "").lower())
if not min_vers:
return False
try:
ua_version = int(ua.get("major", 0))
except:
return False
return ua_version < min_vers
return (min_vers and ua.version >= min_vers)

View File

@ -13,7 +13,7 @@ from pywb.utils.io import chunk_encode_iter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.default_rewriter import RewriterWithJSProxy
from pywb import get_test_dir
@ -39,8 +39,7 @@ def headers(request):
class TestContentRewriter(object):
@classmethod
def setup_class(self):
self.content_rewriter = DefaultRewriter()
self.js_proxy_content_rewriter = RewriterWithJSProxy()
self.content_rewriter = RewriterWithJSProxy()
def _create_response_record(self, url, headers, payload, warc_headers):
writer = BufferWARCWriter()
@ -65,7 +64,6 @@ class TestContentRewriter(object):
record = self._create_response_record(url, headers, content, warc_headers)
wburl = WbUrl(ts + '/' + (request_url or url))
url_rewriter = UrlRewriter(wburl, prefix)
cdx = CDXObject()
cdx['url'] = url
@ -79,11 +77,13 @@ class TestContentRewriter(object):
return ''
if use_js_proxy:
rewriter = self.js_proxy_content_rewriter
rewrite_opts = {}
else:
rewriter = self.content_rewriter
rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'}
return rewriter(record, url_rewriter, cookie_rewriter=None,
url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts)
return self.content_rewriter(record, url_rewriter, cookie_rewriter=None,
head_insert_func=insert_func,
cdx=cdx,
environ=environ)

View File

@ -97,7 +97,7 @@ from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
@pytest.mark.skip("skip for now, made need different s3 source")
def test_s3_read_1():
pytest.importorskip('boto3')
@ -112,13 +112,14 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n'
@pytest.mark.skip("skip for now, made need different s3 source")
def test_s3_read_2():
pytest.importorskip('boto3')
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
buff = res.read()
assert len(buff) == 2082
assert len(buff) == 2330
reader = DecompressingBufferedReader(BytesIO(buff))
assert reader.readline() == b'<!DOCTYPE html>\n'

View File

@ -1,4 +1,4 @@
__version__ = '2.6.5'
__version__ = '2.6.6'
if __name__ == '__main__':
print(__version__)

View File

@ -6,7 +6,7 @@ jinja2<3.0.0
surt>=0.3.1
brotlipy
pyyaml
werkzeug==1.0.1
werkzeug
webencodings
gevent==20.9.0
webassets==0.12.1
@ -16,3 +16,4 @@ fakeredis<1.0
tldextract
python-dateutil
markupsafe<2.1.0
ua_parser

View File

@ -269,15 +269,39 @@ class TestWbIntegration(BaseConfigTest):
assert resp.content_length != 0
assert resp.content_type == 'application/x-javascript'
# test with Chrome user agent
user_agents = [
# chrome
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.3071.115 Safari/537.36'
# firefox
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0'
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/100.0',
# safari
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
# other
'some-custom-browser'
]
# test with each user-agent
for ua in user_agents:
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
headers={'User-Agent': ua})
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
def test_replay_js_ie11_no_obj_proxy(self, fmod):
def test_replay_js_no_obj_proxy(self, fmod):
user_agents = [
# IE11 user-agent, no proxy
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
# old chrome
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/19.0.3071.115 Safari/537.36'
# old firefox
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/12.0'
]
for ua in user_agents:
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
headers={'User-Agent': ua})
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text