#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
from HTMLParser import HTMLParser
from url_rewriter import ArchivalUrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter
#=================================================================
# WBHtml -- HTML parser for custom rewriting; also handles script and css content
#=================================================================
class WBHtml(HTMLParser):
    r"""
    HTML rewriter built on top of HTMLParser.

    Markup is parsed incrementally and written back out to ``self.out``.
    Along the way:

    * attributes listed in REWRITE_TAGS are passed through the url rewriter,
      using the modifier stored for that tag/attribute pair
    * ``javascript:`` attribute values, ``on*`` attributes and the text inside
      a script element are passed through the JS rewriter
    * ``style`` attributes and the text inside a style element are passed
      through the CSS rewriter
    * the url inside a ``meta http-equiv="refresh"`` content attribute is
      rewritten
    * the optional ``headInsert`` text is written once, either right after
      the head start tag or right before the first tag not in HEAD_TAGS

    NOTE(review): the examples below look like they have lost their HTML
    markup (unbalanced quotes, empty arguments) and rely on a ``parse``
    helper that is not defined in this class -- restore them from version
    control before running them through doctest.

    >>> parse('Text')
    Text
    >>> parse('
    ')
    >>> parse(' ')
    >>> parse('')
    >>> parse('')
    # HTML Entities
    >>> parse('› >')
    › >
    # Don't rewrite anchors
    >>> parse('Text')
    Text
    # Unicode
    >>> parse('испытание')
    испытание
    # Meta tag
    >>> parse('')
    >>> parse('')
    >>> parse('')
    # Script tag
    >>> parse('')
    # Unterminated script tag auto-terminate
    >>> parse('
    >>> parse('')
    >>> parse('')
    >>> parse('')
    # Unterminated style tag auto-terminate
    >>> parse('
    # Head Insertion
    >>> parse('Test', headInsert = '')
    Test
    >>> parse('
    SomeTest
    ', headInsert = '/* Insert */')
    /* Insert */
    SomeTest
    >>> parse('
    SomeTest
    ', headInsert = '')
    SomeTest
    """

    # tag -> {attribute: rewrite modifier}; the modifier is handed to the
    # url rewriter together with the attribute value
    REWRITE_TAGS = {
        'a': {'href': ''},
        'applet': {'codebase': 'oe_',
                   'archive': 'oe_'},
        'area': {'href': ''},
        'base': {'href': ''},
        'blockquote': {'cite': ''},
        'body': {'background': 'im_'},
        'del': {'cite': ''},
        'embed': {'src': 'oe_'},
        'head': {'': ''},  # for head rewriting
        'iframe': {'src': 'if_'},
        'img': {'src': 'im_'},
        'ins': {'cite': ''},
        'input': {'src': 'im_'},
        'form': {'action': ''},
        'frame': {'src': 'fr_'},
        'link': {'href': 'oe_'},
        'meta': {'content': ''},
        'object': {'codebase': 'oe_',
                   'data': 'oe_'},
        'q': {'cite': ''},
        'ref': {'href': 'oe_'},
        'script': {'src': 'js_'},
        'div': {'data-src': '',
                'data-uri': ''},
        'li': {'data-src': '',
               'data-uri': ''},
    }

    # tags whose text content is rewritten as JS / CSS until they are closed
    STATE_TAGS = ['script', 'style']

    # tags that do not trigger the head insert when they are encountered
    HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']

    # Marked-section keywords that end with ']]>' in the source markup; every
    # other section (e.g. MS Office 'if'/'endif') ends with ']>'.  The parser
    # strips the terminator before calling unknown_decl(), so it has to be
    # re-added.  NOTE(review): list mirrors the stdlib markupbase module --
    # confirm against the interpreter version in use.
    _STANDARD_SECTIONS = ('temp', 'cdata', 'ignore', 'include', 'rcdata')

    class AccumBuff:
        """Minimal write-only stream that accumulates text in ``buff``."""

        def __init__(self):
            self.buff = ''

        def write(self, string):
            self.buff += string

    def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
        """
        url_rewriter     -- object used to rewrite urls; also passed to the
                            JS and CSS rewriter constructors
        outstream        -- object with a write() method receiving the
                            output; an AccumBuff is used when omitted
        headInsert       -- text written once into the document head, or None
        jsRewriterClass  -- class instantiated as jsRewriterClass(url_rewriter)
        cssRewriterClass -- class instantiated as cssRewriterClass(url_rewriter)
        """
        HTMLParser.__init__(self)

        self.url_rewriter = url_rewriter
        # name of the script/style element currently open, or None
        self._wbParseContext = None
        self.out = outstream if outstream else WBHtml.AccumBuff()

        self.jsRewriter = jsRewriterClass(url_rewriter)
        self.cssRewriter = cssRewriterClass(url_rewriter)

        self.headInsert = headInsert

    # ===========================
    META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)

    def _rewriteMetaRefresh(self, metaRefresh):
        """
        Rewrite the url part of a meta refresh value ('<delay>; url=<url>').

        Returns None for an empty value and the value unchanged when it does
        not match META_REFRESH_REGEX or when the url rewrite fails.
        """
        if not metaRefresh:
            return None

        m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
        if not m:
            return metaRefresh

        try:
            metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
        except Exception:
            # deliberate best effort: keep the original value if the url
            # cannot be rewritten
            pass

        return metaRefresh

    # ===========================
    def _rewriteURL(self, value, mod = None):
        """Return value rewritten by the url rewriter, or None if it is empty."""
        return self.url_rewriter.rewrite(value, mod) if value else None

    def _rewriteCSS(self, cssContent):
        """Return cssContent rewritten by the CSS rewriter, or None if it is empty."""
        return self.cssRewriter.rewrite(cssContent) if cssContent else None

    def _rewriteScript(self, scriptContent):
        """Return scriptContent rewritten by the JS rewriter, or None if it is empty."""
        return self.jsRewriter.rewrite(scriptContent) if scriptContent else None

    def hasAttr(self, tagAttrs, attr):
        """
        Check whether tagAttrs contains the (name, value) pair attr.

        Only the first attribute with a matching name is considered and the
        values are compared case-insensitively.  An attribute without a
        value never matches.
        """
        name, value = attr
        for attrName, attrValue in tagAttrs:
            if attrName == name:
                # valueless attributes are reported with a value of None
                if attrValue is None:
                    return False
                return value.lower() == attrValue.lower()
        return False

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write out a start tag with its attributes rewritten.

        tag        -- tag name
        tagAttrs   -- list of (name, value) pairs
        isStartEnd -- True to close the tag with '/>' instead of '>'

        Returns True if the tag was written, False if the tag has no entry in
        REWRITE_TAGS, in which case nothing but a pending head insert is
        written and the caller is expected to emit the original tag text.
        """
        # special case: script or style parse context
        if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext is None):
            self._wbParseContext = tag

        # special case: head insertion, non-head tags
        elif (self.headInsert and (self._wbParseContext is None) and (tag not in WBHtml.HEAD_TAGS)):
            self.out.write(self.headInsert)
            self.headInsert = None

        # attr rewriting
        handler = WBHtml.REWRITE_TAGS.get(tag)
        if not handler:
            handler = WBHtml.REWRITE_TAGS.get('')

        if not handler:
            return False

        self.out.write('<' + tag)

        for attrName, attrValue in tagAttrs:
            # special case: inline JS/event handler
            if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
                attrValue = self._rewriteScript(attrValue)

            # special case: inline CSS/style attribute
            elif attrName == 'style':
                attrValue = self._rewriteCSS(attrValue)

            # special case: meta tag
            elif (tag == 'meta') and (attrName == 'content'):
                if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
                    attrValue = self._rewriteMetaRefresh(attrValue)

            else:
                # special case: base tag
                if (tag == 'base') and (attrName == 'href') and attrValue:
                    self.url_rewriter.setBaseUrl(attrValue)

                rwMod = handler.get(attrName)
                if rwMod is not None:
                    attrValue = self._rewriteURL(attrValue, rwMod)

            # parser doesn't differentiate between 'attr=""' and just 'attr'
            # 'attr=""' is more common, so use that form
            if attrValue:
                # the value is always emitted between double quotes, so a
                # literal double quote inside it must be escaped or it would
                # terminate the attribute early
                self.out.write(' ' + attrName + '="' + attrValue.replace('"', '&quot;') + '"')
            else:
                self.out.write(' ' + attrName + '=""')

        self.out.write('/>' if isStartEnd else '>')

        # special case: head tag
        if (self.headInsert) and (self._wbParseContext is None) and (tag == 'head'):
            self.out.write(self.headInsert)
            self.headInsert = None

        return True

    def parseData(self, data):
        """Write text content, rewriting it first when inside script/style."""
        if self._wbParseContext == 'script':
            data = self._rewriteScript(data)
        elif self._wbParseContext == 'style':
            data = self._rewriteCSS(data)

        # the rewrite helpers return None for empty content
        if data:
            self.out.write(data)

    def rewrite(self, string):
        """
        Feed string to the parser and return the output accumulated so far.

        The output buffer is detached afterwards, so every call returns only
        the text produced since the previous call.  The result is read from
        the ``buff`` attribute of the output object.
        """
        if not self.out:
            self.out = WBHtml.AccumBuff()

        self.feed(string)

        result = self.out.buff
        # Clear buffer to create new one for next rewrite()
        self.out = None

        return result

    # HTMLParser overrides below
    def close(self):
        """
        Finish parsing and return the output produced while doing so.

        An unterminated script/style element is closed by feeding the
        matching end tag.  Text the base parser was still holding back is
        included in the result when no output object is attached (the state
        rewrite() leaves behind); otherwise it is written to the attached
        output object.
        """
        if (self._wbParseContext):
            result = self.rewrite('</' + self._wbParseContext + '>')
            self._wbParseContext = None
        else:
            result = ''

        if self.out:
            HTMLParser.close(self)
            return result

        # rewrite() detaches the buffer; the base close() may still flush
        # pending text through the handlers, which need somewhere to write
        self.out = WBHtml.AccumBuff()
        try:
            HTMLParser.close(self)
            result += self.out.buff
        finally:
            self.out = None

        return result

    def handle_starttag(self, tag, attrs):
        """Write a start tag, rewritten if possible, else as it appeared."""
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Write a self-closing tag, rewritten if possible, else as it appeared."""
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Write an end tag and leave the script/style context it closes."""
        if (tag == self._wbParseContext):
            self._wbParseContext = None

        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        """Write text content via parseData()."""
        self.parseData(data)

    def handle_entityref(self, data):
        """Write a named entity reference back out unchanged."""
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        """Write a numeric character reference back out unchanged."""
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        """Write a comment back out unchanged."""
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        """Write a declaration such as a doctype back out unchanged."""
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        """Write a processing instruction back out unchanged."""
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        """Write a marked section back out with its terminator restored."""
        keyword = data.split('[', 1)[0].strip().lower()
        terminator = ']]>' if keyword in WBHtml._STANDARD_SECTIONS else ']>'
        self.out.write('<![' + data + terminator)
# instantiate the parser and feed it some HTML
#parser = WBHtml()
#instr = ' Test\n