Mirror of https://github.com/webrecorder/pywb.git, synced 2025-03-24 06:59:52 +01:00
Title parse tweak (#498)
* proxy: update wombat history callback to fire immediately, update to latest wombat
* title parse: add html unescaping (use original unescape method overridden in htmlrewriter)
* tests: add tests for page fetch and title extraction
This commit is contained in:
parent e79c657255
commit 1e9d8f44af
@@ -20,6 +20,13 @@ class BaseContentRewriter(object):
 
     TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
 
+    # set via html_rewriter since it overrides the default one
+    html_unescape = None
+
+    @classmethod
+    def set_unescape(cls, unescape):
+        cls.html_unescape = unescape
+
     @classmethod
     def _extract_title(cls, gen):
         title_res = list(gen)
@@ -31,7 +38,13 @@ class BaseContentRewriter(object):
             return
 
         title_res = m.group(1)
-        return title_res.strip()
+        title_res = title_res.strip()
+        try:
+            title_res = cls.html_unescape(title_res)
+        except Exception as e:
+            pass
+
+        return title_res
 
     def __init__(self, rules_file, replay_mod=''):
         self.rules = []
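Together, the two hunks above make _extract_title strip the matched <title> text and, when an unescape callable has been registered through set_unescape, decode HTML entities before returning it. A minimal self-contained sketch of the resulting behaviour, with the stdlib html.unescape standing in for the parser unescape that the html rewriter registers (see the next file):

import re
import html

# Same pattern as BaseContentRewriter.TITLE in the hunk above
TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)

def extract_title(markup):
    # Find the <title> element, strip surrounding whitespace,
    # then undo HTML escaping the way the new code does.
    m = TITLE.search(markup)
    if not m:
        return None
    title = m.group(1).strip()
    try:
        # html.unescape stands in for the registered html_unescape hook
        title = html.unescape(title)
    except Exception:
        pass
    return title

print(extract_title('<html><title> Test&#39;Title </title></html>'))  # Test'Title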
@@ -11,7 +11,7 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
 from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
 
-from pywb.rewrite.content_rewriter import StreamingRewriter
+from pywb.rewrite.content_rewriter import StreamingRewriter, BaseContentRewriter
 
 from six import text_type
 
@@ -20,9 +20,16 @@ import six.moves.html_parser
 try:
     orig_unescape = six.moves.html_parser.unescape
     six.moves.html_parser.unescape = lambda x: x
+    BaseContentRewriter.set_unescape(orig_unescape)
 except:
     orig_unescape = None
 
+    @staticmethod
+    def __unescape(x):
+        return HTMLParser().unescape(x)
+
+    BaseContentRewriter.set_unescape(__unescape)
+
 
 try:
     import _markupbase as markupbase
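The html rewriter disables the parser's unescaping (replacing it with an identity function) so entities in attributes survive rewriting, but it first hands the captured original unescape to BaseContentRewriter via the new set_unescape hook; if the attribute is missing, a fallback built on HTMLParser().unescape is registered instead. The effect of the split can be sketched with the stdlib html.unescape standing in for the captured function:

import html

raw_title = 'Test&#39;Title'

# While rewriting, unescaping is a no-op, so entities in the markup
# pass through untouched:
noop_unescape = lambda x: x
print(noop_unescape(raw_title))   # Test&#39;Title

# The saved original unescape (html.unescape stands in for it here) is
# applied only when the title is extracted for display:
print(html.unescape(raw_title))   # Test'Title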
File diff suppressed because one or more lines are too long
@@ -13,17 +13,23 @@ import six
 
 # ============================================================================
 def header_test_server(environ, start_response):
-    body = b'body'
-    value = u'⛄'
-    value = value.encode('utf-8')
-    if six.PY3:
-        value = value.decode('latin-1')
-
     headers = []
     if environ['PATH_INFO'] == '/unicode':
+        body = b'body'
+        value = u'⛄'
+        value = value.encode('utf-8')
+        if six.PY3:
+            value = value.decode('latin-1')
+
         headers = [('Content-Length', str(len(body))),
                    ('x-utf-8', value)]
 
+    elif environ['PATH_INFO'] == '/html-title':
+        body = b'<html><title>Test&#39;Title</title></html>'
+
+        headers = [('Content-Length', str(len(body))),
+                   ('Content-Type', 'text/html')]
+
     start_response('200 OK', headers=headers)
     return [body]
 
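The fixture now serves an /html-title page whose <title> contains an HTML entity, giving the live-rewrite tests something to unescape. A quick way to see what that branch returns is to call the WSGI callable directly with a minimal environ; html_title_app below is a trimmed, hypothetical stand-in for header_test_server, used only for this sketch:

def html_title_app(environ, start_response):
    # Same response as the /html-title branch of header_test_server
    body = b'<html><title>Test&#39;Title</title></html>'
    headers = [('Content-Length', str(len(body))),
               ('Content-Type', 'text/html')]
    start_response('200 OK', headers)
    return [body]

collected = {}
def start_response(status, headers):
    # Minimal start_response capturing the status and headers
    collected['status'] = status
    collected['headers'] = dict(headers)

body = b''.join(html_title_app({'PATH_INFO': '/html-title'}, start_response))
print(collected['status'])   # 200 OK
print(body)                  # b'<html><title>Test&#39;Title</title></html>'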
@@ -132,6 +138,19 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
         assert 'Set-Cookie' not in resp.headers
         assert resp.text == 'cookie value: testcookie=cookie-val'
 
+    def test_fetch_page_with_html_title(self, fmod_sl):
+        resp = self.get('/live/{0}http://localhost:%s/html-title' % self.header_test_serv.port, fmod_sl,
+                        headers={'X-Wombat-History-Page': 'http://localhost:{0}/html-title'.format(self.header_test_serv.port),
+                                })
+        assert resp.json == {'title': "Test'Title"}
+
+    def test_fetch_page_with_title(self, fmod_sl):
+        resp = self.get('/live/{0}http://httpbin.org/html', fmod_sl,
+                        headers={'X-Wombat-History-Page': 'http://httpbin.org/html',
+                                 'X-Wombat-History-Title': 'Test%20Title',
+                                })
+        assert resp.json == {'title': 'Test Title'}
+
     def test_live_live_frame(self):
         resp = self.testapp.get('/live/http://example.com/')
         assert resp.status_int == 200
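test_fetch_page_with_title sends the expected title as 'Test%20Title' in the X-Wombat-History-Title header and asserts that 'Test Title' comes back in the JSON. That implies the header value is percent-decoded somewhere on the server side; the decoding itself is not part of this diff, but it amounts to plain URL unquoting:

from urllib.parse import unquote

# 'Test%20Title' as sent in the X-Wombat-History-Title request header
decoded = unquote('Test%20Title')
assert decoded == 'Test Title'
print({'title': decoded})   # {'title': 'Test Title'}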
wombat (submodule)
@@ -1 +1 @@
-Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3
+Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2