1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

proxy mode HTMLInsertOnlyRewriter: (#496)

- insert head-insert before the first tag that is not <html> or <head>
- addresses issue with rewriting pages that have no <head> tag (already handled in full rewriter)
- tests: add tests for HTMLInsertOnlyRewriter
- bump version to 2.3.3, update changelist
This commit is contained in:
Ilya Kreymer 2019-08-03 11:24:50 -07:00 committed by GitHub
parent 42089e237b
commit bf9284fec5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 8 deletions

View File

@ -1,3 +1,9 @@
pywb 2.3.3 changelist
~~~~~~~~~~~~~~~~~~~~~
* Proxy Mode: Ensure head insert is added even if there is no ``<head>`` tag; insert before the first tag that is not ``<html>`` or ``<head>`` (#496)
pywb 2.3.2 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -4,10 +4,10 @@ from pywb.rewrite.content_rewriter import StreamingRewriter
# ============================================================================
class HTMLInsertOnlyRewriter(StreamingRewriter):
""" Insert custom string into HTML <head> tag
""" Insert custom string into HTML into the head, before any tag not <head> or <html>
no other rewriting performed
"""
HEAD_REGEX = re.compile('<\s*head\\b[^>]*[>]+', re.I)
NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I)
def __init__(self, url_rewriter, **kwargs):
super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
@ -19,16 +19,16 @@ class HTMLInsertOnlyRewriter(StreamingRewriter):
if self.done:
return string
# only try to find <head> in first buffer
self.done = True
m = self.HEAD_REGEX.search(string)
m = self.NOT_HEAD_REGEX.search(string)
if m:
inx = m.end()
inx = m.start()
buff = string[:inx]
buff += self.head_insert
buff += string[inx:]
self.done = True
return buff
else:
return string
def final_read(self):
    """ Return any insert text still pending at end of stream

    Gives back an empty string once ``self.done`` is set; otherwise
    returns ``self.head_insert`` so the caller can append it.
    """
    if self.done:
        return ''

    return self.head_insert

View File

@ -0,0 +1,30 @@
r'''
>>> parse('<html><head><some-tag></head</html>')
'<html><head><!--Insert--><some-tag></head</html>'
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
'<HTML><!--Insert--><A Href="page.html">Text</a></hTmL>'
>>> parse('<html> < head> <link>')
'<html> < head> <!--Insert--><link>'
>>> parse('< head> <link> <html>')
'< head> <!--Insert--><link> <html>'
>>> parse('<head></head>text')
'<head></head>text<!--Insert-->'
'''
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
def parse(html_text):
    """ Doctest helper: pass html_text through HTMLInsertOnlyRewriter

    Builds a rewriter whose head insert is ``<!--Insert-->``, feeds it the
    whole input as a single buffer, and returns the rewritten text followed
    by whatever ``final_read()`` yields.
    """
    rewriter = HTMLInsertOnlyRewriter(
        UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/'),
        head_insert='<!--Insert-->',
    )

    # rewrite() must run before final_read(), which reports leftover state
    rewritten = rewriter.rewrite(html_text)
    trailer = rewriter.final_read()
    return rewritten + trailer

View File

@ -1,4 +1,4 @@
# Package version string.
# NOTE(review): two consecutive assignments are visible here; the later one
# ('2.3.3') is the value in effect -- presumably the first is the pre-bump
# line shown by the diff view. Confirm against the real file.
__version__ = '2.3.2'
__version__ = '2.3.3'
# When this module is executed directly as a script, print the version.
if __name__ == '__main__':
print(__version__)