mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Title parse tweak (#498)
* proxy: update wombat history callback to fire immediately, update to latest wombat
* title parse: add html unescaping (use original unescaped method overridden in htmlrewriter)
* tests: add tests for page fetch and title extraction
This commit is contained in:
parent
e79c657255
commit
1e9d8f44af
@ -20,6 +20,13 @@ class BaseContentRewriter(object):
|
||||
|
||||
TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
|
||||
|
||||
# set via html_rewriter since it overrides the default one
|
||||
html_unescape = None
|
||||
|
||||
@classmethod
|
||||
def set_unescape(cls, unescape):
|
||||
cls.html_unescape = unescape
|
||||
|
||||
@classmethod
|
||||
def _extract_title(cls, gen):
|
||||
title_res = list(gen)
|
||||
@ -31,7 +38,13 @@ class BaseContentRewriter(object):
|
||||
return
|
||||
|
||||
title_res = m.group(1)
|
||||
return title_res.strip()
|
||||
title_res = title_res.strip()
|
||||
try:
|
||||
title_res = cls.html_unescape(title_res)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
return title_res
|
||||
|
||||
def __init__(self, rules_file, replay_mod=''):
|
||||
self.rules = []
|
||||
|
@ -11,7 +11,7 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
|
||||
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter, BaseContentRewriter
|
||||
|
||||
from six import text_type
|
||||
|
||||
@ -20,9 +20,16 @@ import six.moves.html_parser
|
||||
try:
|
||||
orig_unescape = six.moves.html_parser.unescape
|
||||
six.moves.html_parser.unescape = lambda x: x
|
||||
BaseContentRewriter.set_unescape(orig_unescape)
|
||||
except:
|
||||
orig_unescape = None
|
||||
|
||||
@staticmethod
|
||||
def __unescape(x):
|
||||
return HTMLParser().unescape(x)
|
||||
|
||||
BaseContentRewriter.set_unescape(__unescape)
|
||||
|
||||
|
||||
try:
|
||||
import _markupbase as markupbase
|
||||
|
File diff suppressed because one or more lines are too long
@ -13,17 +13,23 @@ import six
|
||||
|
||||
# ============================================================================
|
||||
def header_test_server(environ, start_response):
|
||||
body = b'body'
|
||||
value = u'⛄'
|
||||
value = value.encode('utf-8')
|
||||
if six.PY3:
|
||||
value = value.decode('latin-1')
|
||||
|
||||
headers = []
|
||||
if environ['PATH_INFO'] == '/unicode':
|
||||
body = b'body'
|
||||
value = u'⛄'
|
||||
value = value.encode('utf-8')
|
||||
if six.PY3:
|
||||
value = value.decode('latin-1')
|
||||
|
||||
headers = [('Content-Length', str(len(body))),
|
||||
('x-utf-8', value)]
|
||||
|
||||
elif environ['PATH_INFO'] == '/html-title':
|
||||
body = b'<html><title>Test&#39;Title</title></html>'
|
||||
|
||||
headers = [('Content-Length', str(len(body))),
|
||||
('Content-Type', 'text/html')]
|
||||
|
||||
start_response('200 OK', headers=headers)
|
||||
return [body]
|
||||
|
||||
@ -132,6 +138,19 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
||||
assert 'Set-Cookie' not in resp.headers
|
||||
assert resp.text == 'cookie value: testcookie=cookie-val'
|
||||
|
||||
def test_fetch_page_with_html_title(self, fmod_sl):
|
||||
resp = self.get('/live/{0}http://localhost:%s/html-title' % self.header_test_serv.port, fmod_sl,
|
||||
headers={'X-Wombat-History-Page': 'http://localhost:{0}/html-title'.format(self.header_test_serv.port),
|
||||
})
|
||||
assert resp.json == {'title': "Test'Title"}
|
||||
|
||||
def test_fetch_page_with_title(self, fmod_sl):
|
||||
resp = self.get('/live/{0}http://httpbin.org/html', fmod_sl,
|
||||
headers={'X-Wombat-History-Page': 'http://httpbin.org/html',
|
||||
'X-Wombat-History-Title': 'Test%20Title',
|
||||
})
|
||||
assert resp.json == {'title': 'Test Title'}
|
||||
|
||||
def test_live_live_frame(self):
|
||||
resp = self.testapp.get('/live/http://example.com/')
|
||||
assert resp.status_int == 200
|
||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
||||
Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3
|
||||
Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2
|
Loading…
x
Reference in New Issue
Block a user