1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Title parse tweak (#498)

* proxy: update wombat history callback to fire immediately, update to latest wombat
* title parse: add html unescaping (use the original unescape method, which htmlrewriter overrides)
* tests: add tests for page fetch and title extraction
This commit is contained in:
Ilya Kreymer 2019-08-13 16:12:37 -07:00 committed by GitHub
parent e79c657255
commit 1e9d8f44af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 49 additions and 10 deletions

View File

@ -20,6 +20,13 @@ class BaseContentRewriter(object):
TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
# set via html_rewriter, since html_rewriter overrides the default unescape function
html_unescape = None
@classmethod
def set_unescape(cls, unescape):
cls.html_unescape = unescape
@classmethod
def _extract_title(cls, gen):
title_res = list(gen)
@ -31,7 +38,13 @@ class BaseContentRewriter(object):
return
title_res = m.group(1)
return title_res.strip()
title_res = title_res.strip()
try:
title_res = cls.html_unescape(title_res)
except Exception as e:
pass
return title_res
def __init__(self, rules_file, replay_mod=''):
self.rules = []

View File

@ -11,7 +11,7 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
from pywb.rewrite.content_rewriter import StreamingRewriter
from pywb.rewrite.content_rewriter import StreamingRewriter, BaseContentRewriter
from six import text_type
@ -20,9 +20,16 @@ import six.moves.html_parser
try:
orig_unescape = six.moves.html_parser.unescape
six.moves.html_parser.unescape = lambda x: x
BaseContentRewriter.set_unescape(orig_unescape)
except:
orig_unescape = None
@staticmethod
def __unescape(x):
return HTMLParser().unescape(x)
BaseContentRewriter.set_unescape(__unescape)
try:
import _markupbase as markupbase

File diff suppressed because one or more lines are too long

View File

@ -13,17 +13,23 @@ import six
# ============================================================================
def header_test_server(environ, start_response):
body = b'body'
value = u''
value = value.encode('utf-8')
if six.PY3:
value = value.decode('latin-1')
headers = []
if environ['PATH_INFO'] == '/unicode':
body = b'body'
value = u''
value = value.encode('utf-8')
if six.PY3:
value = value.decode('latin-1')
headers = [('Content-Length', str(len(body))),
('x-utf-8', value)]
elif environ['PATH_INFO'] == '/html-title':
body = b'<html><title>Test&#39;Title</title></html>'
headers = [('Content-Length', str(len(body))),
('Content-Type', 'text/html')]
start_response('200 OK', headers=headers)
return [body]
@ -132,6 +138,19 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
assert 'Set-Cookie' not in resp.headers
assert resp.text == 'cookie value: testcookie=cookie-val'
def test_fetch_page_with_html_title(self, fmod_sl):
resp = self.get('/live/{0}http://localhost:%s/html-title' % self.header_test_serv.port, fmod_sl,
headers={'X-Wombat-History-Page': 'http://localhost:{0}/html-title'.format(self.header_test_serv.port),
})
assert resp.json == {'title': "Test'Title"}
def test_fetch_page_with_title(self, fmod_sl):
resp = self.get('/live/{0}http://httpbin.org/html', fmod_sl,
headers={'X-Wombat-History-Page': 'http://httpbin.org/html',
'X-Wombat-History-Title': 'Test%20Title',
})
assert resp.json == {'title': 'Test Title'}
def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200

2
wombat

@ -1 +1 @@
Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3
Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2