mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-31 03:04:12 +02:00
330 lines
12 KiB
Python
330 lines
12 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import sys
|
|
import re
|
|
|
|
from HTMLParser import HTMLParser
|
|
from url_rewriter import ArchivalUrlRewriter
|
|
from regex_rewriters import JSRewriter, CSSRewriter
|
|
|
|
#=================================================================
|
|
# WBHtml -- HTML parser for custom rewriting, with handlers for script and CSS content
|
|
#=================================================================
|
|
class WBHtml(HTMLParser):
    r"""
    HTML parser that re-serializes the markup it is fed while rewriting
    URL-bearing attributes, inline script and inline CSS.

    URLs go through the ``url_rewriter`` passed to the constructor;
    ``<script>``/``<style>`` bodies, ``on*`` handlers, ``javascript:``
    values and ``style`` attributes go through the JS and CSS rewriters.
    An optional ``headInsert`` string is written exactly once: right after
    ``<head>``, or before the first tag that is not listed in ``HEAD_TAGS``.

    >>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

    >>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

    >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

    >>> parse('<input "selected"><img src></div>')
    <input "selected"=""><img src=""></div>

    >>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
    <html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>

    # HTML Entities
    >>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
    <a href="">&rsaquo; &nbsp; &#62;</div>

    # Don't rewrite anchors
    >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
    <HTML><a href="#abc">Text</a></html>

    # Unicode
    >>> parse('<a href="http://испытание.испытание/">испытание</a>')
    <a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

    # Meta tag
    >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

    >>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
    <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>

    >>> parse('<META http-equiv="refresh" content>')
    <meta http-equiv="refresh" content="">

    # Meta tag with a valueless http-equiv attribute
    >>> parse('<meta http-equiv content="x">')
    <meta http-equiv="" content="x">

    # Script tag
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

    # Unterminated script tag auto-terminate
    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>

    >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

    >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>

    >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>

    # Unterminated style tag auto-terminate
    >>> parse('<style>@import url(styles.css)')
    <style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>

    # Head Insertion
    >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', headInsert = '<script src="cool.js"></script>')
    <html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>

    >>> parse('<body><div>SomeTest</div>', headInsert = '/* Insert */')
    /* Insert */<body><div>SomeTest</div>

    >>> parse('<link href="abc.txt"><div>SomeTest</div>', headInsert = '<script>load_stuff();</script>')
    <link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>

    """

    # tag name -> {attribute name: modifier}.  The modifier is passed as the
    # second argument to url_rewriter.rewrite() for that attribute's value.
    # Tags missing from this table are written back verbatim.
    REWRITE_TAGS = {
        'a':       {'href': ''},
        'applet':  {'codebase': 'oe_',
                    'archive': 'oe_'},
        'area':    {'href': ''},
        'base':    {'href': ''},
        'blockquote': {'cite': ''},
        'body':    {'background': 'im_'},
        'del':     {'cite': ''},
        'embed':   {'src': 'oe_'},
        'head':    {'': ''},     # for head rewriting
        'iframe':  {'src': 'if_'},
        'img':     {'src': 'im_'},
        'ins':     {'cite': ''},
        'input':   {'src': 'im_'},
        'form':    {'action': ''},
        'frame':   {'src': 'fr_'},
        'link':    {'href': 'oe_'},
        'meta':    {'content': ''},
        'object':  {'codebase': 'oe_',
                    'data': 'oe_'},
        'q':       {'cite': ''},
        'ref':     {'href': 'oe_'},
        'script':  {'src': 'js_'},
        'div':     {'data-src' : '',
                    'data-uri' : ''},
        'li':      {'data-src' : '',
                    'data-uri' : ''},
    }

    # Tags whose text content is rewritten as script / CSS while open.
    STATE_TAGS = ['script', 'style']

    # Tags that do not trigger the fallback head insertion.
    HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']

    class AccumBuff:
        """Minimal write-only sink that accumulates output in ``buff``."""

        def __init__(self):
            self.buff = ''

        def write(self, string):
            self.buff += string


    def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
        """
        url_rewriter      -- object providing rewrite(value, mod) and
                             setBaseUrl(value)
        outstream         -- optional object with write(); when omitted an
                             internal AccumBuff collects the output
        headInsert        -- optional text to inject once (see class doc)
        jsRewriterClass   -- called with url_rewriter; result must provide
                             rewrite(content)
        cssRewriterClass  -- same contract as jsRewriterClass, for CSS
        """
        HTMLParser.__init__(self)

        self.url_rewriter = url_rewriter
        # None, or the name of the currently open script/style element
        self._wbParseContext = None
        self.out = outstream if outstream else WBHtml.AccumBuff()

        self.jsRewriter = jsRewriterClass(url_rewriter)
        self.cssRewriter = cssRewriterClass(url_rewriter)

        self.headInsert = headInsert


    # ===========================
    META_REFRESH_REGEX = re.compile(r'^[\d.]+\s*;\s*url\s*=\s*(.+?)\s*$', re.IGNORECASE | re.MULTILINE)

    def _rewriteMetaRefresh(self, metaRefresh):
        """
        Rewrite the URL part of a meta refresh value ('<delay>; url=<target>').
        Returns None for an empty value and the input unchanged when it does
        not match META_REFRESH_REGEX.
        """
        if not metaRefresh:
            return None

        m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
        if not m:
            return metaRefresh

        try:
            metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
        except Exception:
            # Deliberately best-effort: a URL the rewriter cannot handle
            # leaves the original content value in place.
            pass

        return metaRefresh
    # ===========================

    def _rewriteURL(self, value, mod = None):
        """Rewrite a URL through url_rewriter; None for an empty value."""
        return self.url_rewriter.rewrite(value, mod) if value else None


    def _rewriteCSS(self, cssContent):
        """Rewrite CSS through the CSS rewriter; None for empty content."""
        return self.cssRewriter.rewrite(cssContent) if cssContent else None

    def _rewriteScript(self, scriptContent):
        """Rewrite script through the JS rewriter; None for empty content."""
        return self.jsRewriter.rewrite(scriptContent) if scriptContent else None

    def hasAttr(self, tagAttrs, attr):
        """
        Return True if the first attribute in tagAttrs named attr[0] has a
        value equal to attr[1], compared case-insensitively.  A valueless
        attribute (value None) never matches.
        """
        name, value = attr
        for attrName, attrValue in tagAttrs:
            if attrName == name:
                # attrValue is None for attributes written without a value
                return attrValue is not None and value.lower() == attrValue.lower()
        return False

    def _rewriteAttr(self, tag, tagAttrs, handler, attrName, attrValue):
        """Return the rewritten value for one attribute of a handled tag."""
        # special case: inline JS/event handler
        if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
            return self._rewriteScript(attrValue)

        # special case: inline CSS/style attribute
        if attrName == 'style':
            return self._rewriteCSS(attrValue)

        # special case: meta tag -- only a refresh target is rewritten
        if (tag == 'meta') and (attrName == 'content'):
            if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
                return self._rewriteMetaRefresh(attrValue)
            return attrValue

        # special case: base tag updates the rewriter's base URL first
        if (tag == 'base') and (attrName == 'href') and attrValue:
            self.url_rewriter.setBaseUrl(attrValue)

        rwMod = handler.get(attrName)
        if rwMod is not None:
            return self._rewriteURL(attrValue, rwMod)

        return attrValue

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write a rewritten start tag for tags listed in REWRITE_TAGS.

        Returns False, having written no tag text, when the tag has no
        handler; the caller then emits the original tag text.  Parse context
        tracking and the fallback head insertion happen in either case.
        """
        # special case: script or style parse context
        if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext is None):
            self._wbParseContext = tag

        # special case: head insertion, non-head tags
        elif (self.headInsert and (self._wbParseContext is None) and (tag not in WBHtml.HEAD_TAGS)):
            self.out.write(self.headInsert)
            self.headInsert = None

        # attr rewriting ('' is an optional catch-all entry)
        handler = WBHtml.REWRITE_TAGS.get(tag)
        if not handler:
            handler = WBHtml.REWRITE_TAGS.get('')

        if not handler:
            return False

        self.out.write('<' + tag)

        for attrName, attrValue in tagAttrs:
            attrValue = self._rewriteAttr(tag, tagAttrs, handler, attrName, attrValue)

            # parser doesn't differentiate between 'attr=""' and just 'attr'
            # 'attr=""' is more common, so use that form
            if attrValue:
                # HTMLParser hands over unescaped values, so a literal
                # double quote must be re-escaped to keep the attribute intact
                self.out.write(' ' + attrName + '="' + attrValue.replace('"', '&quot;') + '"')
            else:
                self.out.write(' ' + attrName + '=""')

        self.out.write('/>' if isStartEnd else '>')

        # special case: head tag
        if (self.headInsert) and (self._wbParseContext is None) and (tag == 'head'):
            self.out.write(self.headInsert)
            self.headInsert = None

        return True


    def parseData(self, data):
        """Write text content, rewriting it when inside script or style."""
        # The rewrite helpers return None for empty input, which must not
        # reach write(); empty data is passed through untouched.
        if data:
            if self._wbParseContext == 'script':
                data = self._rewriteScript(data)
            elif self._wbParseContext == 'style':
                data = self._rewriteCSS(data)

        self.out.write(data)

    def _takeOutput(self):
        """
        Return the text accumulated in the internal buffer and start a fresh
        one.  Returns '' when output goes to a caller-supplied stream, which
        has already received the data.
        """
        out = self.out
        if not isinstance(out, WBHtml.AccumBuff):
            return ''

        self.out = WBHtml.AccumBuff()
        return out.buff

    def rewrite(self, string):
        """
        Feed a chunk of HTML and return the output produced so far.  Markup
        the parser is still buffering is returned by a later rewrite() or by
        close().
        """
        if not self.out:
            self.out = WBHtml.AccumBuff()

        self.feed(string)
        return self._takeOutput()

    # HTMLParser overrides below
    def close(self):
        """
        Finish parsing and return any remaining output: an open script/style
        element is terminated and the parser's pending input is flushed.
        """
        if not self.out:
            self.out = WBHtml.AccumBuff()

        if (self._wbParseContext):
            self.feed('</' + self._wbParseContext + '>')
            self._wbParseContext = None

        # Flush before collecting so text the parser was still holding is
        # written to a live sink and included in the result.
        HTMLParser.close(self)
        return self._takeOutput()

    def handle_starttag(self, tag, attrs):
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        # leaving the script/style element that opened the parse context
        if (tag == self._wbParseContext):
            self._wbParseContext = None

        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        self.parseData(data)

    def handle_entityref(self, data):
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        self.out.write('<!--')
        self.parseData(data)
        self.out.write('-->')

    def handle_decl(self, data):
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        self.out.write('<![')
        self.parseData(data)
        self.out.write(']>')
|
|
|
|
|
|
# instantiate the parser and feed it some HTML
|
|
#parser = WBHtml()
|
|
#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
|
|
#print instr
|
|
#print
|
|
#parser.feed(instr)
|
|
#print
|
|
import utils
|
|
if __name__ == "__main__" or utils.enable_doctests():

    # Shared fixture for the WBHtml doctests: every example is rewritten
    # relative to this archived page, with '/web/' as the prefix argument.
    url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    def parse(data, headInsert = None):
        """Rewrite data with a fresh WBHtml and print the complete result."""
        rewriter = WBHtml(url_rewriter, headInsert = headInsert)
        rewritten = rewriter.rewrite(data)
        # close() supplies whatever the parser produced after the last feed
        print(rewritten + rewriter.close())

    import doctest
    doctest.testmod()
|
|
|
|
|