mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
proxy mode HTMLInsertOnlyRewriter: (#496)
- insert head-insert before the first tag that is not <html> or <head> - addresses issue with rewriting pages that have no <head> tag (already handled in full rewriter) - tests: add tests for HTMLInsertOnlyRewriter - bump version to 2.3.3, update changelist
This commit is contained in:
parent
42089e237b
commit
bf9284fec5
@ -1,3 +1,9 @@
|
||||
pywb 2.3.3 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Proxy Mode: Ensure head insert is added even if there is no ``<head>`` tag, insert before the first tag that is not ``<html>`` or ``<head>`` (#496)
|
||||
|
||||
|
||||
pywb 2.3.2 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -4,10 +4,10 @@ from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
|
||||
# ============================================================================
|
||||
class HTMLInsertOnlyRewriter(StreamingRewriter):
|
||||
""" Insert custom string into HTML <head> tag
|
||||
""" Insert custom string into HTML into the head, before any tag not <head> or <html>
|
||||
no other rewriting performed
|
||||
"""
|
||||
HEAD_REGEX = re.compile('<\s*head\\b[^>]*[>]+', re.I)
|
||||
NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I)
|
||||
|
||||
def __init__(self, url_rewriter, **kwargs):
|
||||
super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
|
||||
@ -19,16 +19,16 @@ class HTMLInsertOnlyRewriter(StreamingRewriter):
|
||||
if self.done:
|
||||
return string
|
||||
|
||||
# only try to find <head> in first buffer
|
||||
self.done = True
|
||||
m = self.HEAD_REGEX.search(string)
|
||||
m = self.NOT_HEAD_REGEX.search(string)
|
||||
if m:
|
||||
inx = m.end()
|
||||
inx = m.start()
|
||||
buff = string[:inx]
|
||||
buff += self.head_insert
|
||||
buff += string[inx:]
|
||||
self.done = True
|
||||
return buff
|
||||
else:
|
||||
return string
|
||||
|
||||
|
||||
def final_read(self):
    """Return whatever output is still owed after the last buffer.

    When ``self.done`` is set, nothing further is returned (empty
    string). Otherwise ``self.head_insert`` is returned so the caller
    can still emit it.
    """
    if self.done:
        return ''
    return self.head_insert
|
||||
|
30
pywb/rewrite/test/test_html_insert_rewriter.py
Normal file
30
pywb/rewrite/test/test_html_insert_rewriter.py
Normal file
@ -0,0 +1,30 @@
|
||||
|
||||
|
||||
|
||||
r'''
|
||||
>>> parse('<html><head><some-tag></head</html>')
|
||||
'<html><head><!--Insert--><some-tag></head</html>'
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
'<HTML><!--Insert--><A Href="page.html">Text</a></hTmL>'
|
||||
|
||||
>>> parse('<html> < head> <link>')
|
||||
'<html> < head> <!--Insert--><link>'
|
||||
|
||||
>>> parse('< head> <link> <html>')
|
||||
'< head> <!--Insert--><link> <html>'
|
||||
|
||||
>>> parse('<head></head>text')
|
||||
'<head></head>text<!--Insert-->'
|
||||
'''
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
|
||||
|
||||
def parse(html_text):
    """Rewrite ``html_text`` with an insert-only rewriter and return the result.

    Builds an ``HTMLInsertOnlyRewriter`` whose head insert is the marker
    ``<!--Insert-->``, feeds it the whole input as a single buffer, and
    appends the output of ``final_read()`` to what ``rewrite()`` returned.
    """
    insert_rewriter = HTMLInsertOnlyRewriter(
        UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/'),
        head_insert='<!--Insert-->',
    )

    # rewrite() must run before final_read(): the left operand is evaluated first.
    rewritten = insert_rewriter.rewrite(html_text)
    return rewritten + insert_rewriter.final_read()
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '2.3.2'
|
||||
__version__ = '2.3.3'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
Loading…
x
Reference in New Issue
Block a user