mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-14 15:53:28 +01:00
User-Agent Detection Fix + New-Style rewriting on by default + Dependency Update (2.6.6) (#708)
* js rewriting: use modern js-proxy based rewriting by default, use legacy rewriting only if the browser is older than the minimum supported version, as suggested in #707 * user-agent detection: use ua_parser for user-agent detection instead of the obsolete werkzeug.useragents, which also did not support browser versions >=100 * tests: additional tests for rewriting with various user-agents, defaulting to new-style rewriting for unknown browsers * dockerfile: update Dockerfile to use py3.8 * tests: skip s3 tests dependent on commoncrawl data (for now, need better s3 tests) * bump to 2.6.6, update CHANGES
This commit is contained in:
parent
63ac82ee6f
commit
403167fbe0
@ -1,3 +1,11 @@
|
||||
pywb 2.6.6 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* dependency: don't use obsolete werkzeug useragent package `#704 <https://github.com/webrecorder/pywb/pull/704>`_
|
||||
* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 <https://github.com/webrecorder/pywb/pull/707>`_
|
||||
* fix tests: disable broken s3 tests for now
|
||||
* Dockerfile: use python 3.8 by default
|
||||
|
||||
pywb 2.6.5 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
ARG PYTHON=python:3.7.2
|
||||
ARG PYTHON=python:3.8
|
||||
|
||||
FROM $PYTHON
|
||||
|
||||
|
@ -20,7 +20,7 @@ from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
|
||||
from pywb import DEFAULT_RULES_FILE
|
||||
|
||||
import copy
|
||||
from werkzeug.useragents import UserAgent
|
||||
from ua_parser import user_agent_parser
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -34,7 +34,7 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
|
||||
'css': CSSRewriter,
|
||||
|
||||
'js': JSLocationOnlyRewriter,
|
||||
'js': JSWombatProxyRewriter,
|
||||
'js-proxy': JSNoneRewriter,
|
||||
'js-worker': JSWorkerRewriter,
|
||||
|
||||
@ -119,33 +119,44 @@ class RewriterWithJSProxy(DefaultRewriter):
|
||||
super(RewriterWithJSProxy, self).__init__(*args, **kwargs)
|
||||
|
||||
def get_rewriter(self, rw_type, rwinfo=None):
|
||||
if rw_type == 'js' and rwinfo:
|
||||
# check if UA allows this
|
||||
if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
|
||||
return JSWombatProxyRewriter
|
||||
if rw_type != 'js' or not rwinfo:
|
||||
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
|
||||
|
||||
# otherwise, return default rewriter
|
||||
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
|
||||
# check if should use old non-proxy rewriter
|
||||
if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
|
||||
print("loc only")
|
||||
return JSLocationOnlyRewriter
|
||||
else:
|
||||
# otherwise, return default, js proxy-capable rewriter
|
||||
return JSWombatProxyRewriter
|
||||
|
||||
def ua_allows_obj_proxy(self, opts):
|
||||
def ua_no_obj_proxy(self, opts):
|
||||
ua = opts.get('ua')
|
||||
if not ua:
|
||||
ua_string = opts.get('ua_string')
|
||||
if ua_string:
|
||||
ua = UserAgent(ua_string)
|
||||
ua = user_agent_parser.ParseUserAgent(ua_string)
|
||||
|
||||
if ua is None:
|
||||
return True
|
||||
return False
|
||||
|
||||
supported = {
|
||||
'chrome': '49.0',
|
||||
'firefox': '44.0',
|
||||
'safari': '10.0',
|
||||
'opera': '36.0',
|
||||
'edge': '12.0',
|
||||
'msie': None,
|
||||
'chrome': 49,
|
||||
'firefox': 4,
|
||||
'safari': 10,
|
||||
'opera': 36,
|
||||
'edge': 12,
|
||||
'ie': 1000,
|
||||
}
|
||||
|
||||
min_vers = supported.get(ua.browser)
|
||||
min_vers = supported.get(ua.get("family", "").lower())
|
||||
if not min_vers:
|
||||
return False
|
||||
|
||||
try:
|
||||
ua_version = int(ua.get("major", 0))
|
||||
except:
|
||||
return False
|
||||
|
||||
return ua_version < min_vers
|
||||
|
||||
return (min_vers and ua.version >= min_vers)
|
||||
|
@ -13,7 +13,7 @@ from pywb.utils.io import chunk_encode_iter
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
|
||||
from pywb.rewrite.default_rewriter import RewriterWithJSProxy
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
@ -39,8 +39,7 @@ def headers(request):
|
||||
class TestContentRewriter(object):
|
||||
@classmethod
|
||||
def setup_class(self):
|
||||
self.content_rewriter = DefaultRewriter()
|
||||
self.js_proxy_content_rewriter = RewriterWithJSProxy()
|
||||
self.content_rewriter = RewriterWithJSProxy()
|
||||
|
||||
def _create_response_record(self, url, headers, payload, warc_headers):
|
||||
writer = BufferWARCWriter()
|
||||
@ -65,7 +64,6 @@ class TestContentRewriter(object):
|
||||
record = self._create_response_record(url, headers, content, warc_headers)
|
||||
|
||||
wburl = WbUrl(ts + '/' + (request_url or url))
|
||||
url_rewriter = UrlRewriter(wburl, prefix)
|
||||
|
||||
cdx = CDXObject()
|
||||
cdx['url'] = url
|
||||
@ -79,11 +77,13 @@ class TestContentRewriter(object):
|
||||
return ''
|
||||
|
||||
if use_js_proxy:
|
||||
rewriter = self.js_proxy_content_rewriter
|
||||
rewrite_opts = {}
|
||||
else:
|
||||
rewriter = self.content_rewriter
|
||||
rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'}
|
||||
|
||||
return rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||
url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts)
|
||||
|
||||
return self.content_rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||
head_insert_func=insert_func,
|
||||
cdx=cdx,
|
||||
environ=environ)
|
||||
|
@ -97,7 +97,7 @@ from pywb import get_test_dir
|
||||
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
|
||||
|
||||
@pytest.mark.skip("skip for now, made need different s3 source")
|
||||
def test_s3_read_1():
|
||||
pytest.importorskip('boto3')
|
||||
|
||||
@ -112,13 +112,14 @@ def test_s3_read_1():
|
||||
assert reader.readline() == b'WARC/1.0\r\n'
|
||||
assert reader.readline() == b'WARC-Type: response\r\n'
|
||||
|
||||
@pytest.mark.skip("skip for now, made need different s3 source")
|
||||
def test_s3_read_2():
|
||||
pytest.importorskip('boto3')
|
||||
|
||||
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
|
||||
|
||||
buff = res.read()
|
||||
assert len(buff) == 2082
|
||||
assert len(buff) == 2330
|
||||
|
||||
reader = DecompressingBufferedReader(BytesIO(buff))
|
||||
assert reader.readline() == b'<!DOCTYPE html>\n'
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '2.6.5'
|
||||
__version__ = '2.6.6'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
@ -6,7 +6,7 @@ jinja2<3.0.0
|
||||
surt>=0.3.1
|
||||
brotlipy
|
||||
pyyaml
|
||||
werkzeug==1.0.1
|
||||
werkzeug
|
||||
webencodings
|
||||
gevent==20.9.0
|
||||
webassets==0.12.1
|
||||
@ -16,3 +16,4 @@ fakeredis<1.0
|
||||
tldextract
|
||||
python-dateutil
|
||||
markupsafe<2.1.0
|
||||
ua_parser
|
||||
|
@ -269,17 +269,41 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert resp.content_length != 0
|
||||
assert resp.content_type == 'application/x-javascript'
|
||||
|
||||
# test with Chrome user agent
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
||||
user_agents = [
|
||||
# chrome
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.3071.115 Safari/537.36'
|
||||
# firefox
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0'
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/100.0',
|
||||
# safari
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
|
||||
# other
|
||||
'some-custom-browser'
|
||||
]
|
||||
|
||||
def test_replay_js_ie11_no_obj_proxy(self, fmod):
|
||||
# IE11 user-agent, no proxy
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
||||
# test with each user-agent
|
||||
for ua in user_agents:
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': ua})
|
||||
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
||||
|
||||
def test_replay_js_no_obj_proxy(self, fmod):
|
||||
user_agents = [
|
||||
# IE11 user-agent, no proxy
|
||||
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
|
||||
# old chrome
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/19.0.3071.115 Safari/537.36'
|
||||
# old firefox
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/12.0'
|
||||
]
|
||||
|
||||
for ua in user_agents:
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': ua})
|
||||
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
||||
|
||||
def test_replay_non_exact(self, fmod):
|
||||
# non-exact mode, don't redirect to exact capture
|
||||
|
Loading…
x
Reference in New Issue
Block a user