1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

html parser fleshed out!

This commit is contained in:
Ilya Kreymer 2013-12-22 18:12:05 -08:00
parent fbf29e80d6
commit 37e57f7013
3 changed files with 170 additions and 45 deletions

View File

@ -129,24 +129,28 @@ class ArchivalUrl:
# Str Representation
# ====================
def __str__(self):
if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
@staticmethod
def to_str(atype, mod, timestamp, url):
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
tsmod = "/"
if self.mod:
tsmod += self.mod + "/"
if self.timestamp:
tsmod += self.timestamp
if mod:
tsmod += mod + "/"
if timestamp:
tsmod += timestamp
tsmod += "*/" + self.url
if self.type == ArchivalUrl.URL_QUERY:
tsmod += "*/" + url
if atype == ArchivalUrl.URL_QUERY:
tsmod += "*"
return tsmod
else:
tsmod = self.timestamp + self.mod
tsmod = timestamp + mod
if len(tsmod) > 0:
return "/" + tsmod + "/" + self.url
return "/" + tsmod + "/" + url
else:
return "/" + self.url
return "/" + url
def __str__(self):
    """Return the string form of this archival url.

    Delegates to ArchivalUrl.to_str with this instance's type, mod,
    timestamp and url.
    """
    return ArchivalUrl.to_str(atype=self.type,
                              mod=self.mod,
                              timestamp=self.timestamp,
                              url=self.url)
def __repr__(self):
    """Return a debug string: the tuple (type, timestamp, mod, url, str(self))."""
    fields = (self.type, self.timestamp, self.mod, self.url, str(self))
    return str(fields)

View File

@ -1,12 +1,10 @@
import sys
import re
from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
tag_list = {
'a': {'href': ''},
'img': {'src': 'im_'}
}
# create a subclass and override the handler methods
class WBHtml(HTMLParser):
@ -14,57 +12,164 @@ class WBHtml(HTMLParser):
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
<img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
>>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
"""
# Table of URL-carrying attributes, keyed by tag name as delivered by the
# parser (lowercase, see the doctests above: '<A Href=..>' comes out as '<a href=..>').
# Each tag maps attribute name -> modifier string that is passed to the
# rewriter along with the attribute value. An empty string still triggers
# rewriting, just without a modifier (e.g. <a href> in the doctests), while
# 'im_' shows up right after the timestamp in the rewritten <img src>.
# Tags missing from this table are written out unmodified.
REWRITE_TAGS = {
'a': {'href': ''},
'applet': {'codebase': 'oe_',
'archive': 'oe_'},
'area': {'href': ''},
'base': {'href': ''},
'blockquote': {'cite': ''},
'body': {'background': 'im_'},
'del': {'cite': ''},
'embed': {'src': 'oe_'},
'iframe': {'src': 'if_'},
'img': {'src': 'im_'},
'ins': {'cite': ''},
'input': {'src': 'im_'},
'form': {'action': ''},
'frame': {'src': 'fr_'},
'link': {'href': 'oe_'},
'meta': {'content': ''},
'object': {'codebase': 'oe_',
'data': 'oe_'},
'q': {'cite': ''},
'script': {'src': 'js_'},
'div': {'data-src' : '',
'data-uri' : ''},
'li': {'data-src' : '',
'data-uri' : ''},
}
# NOTE(review): presumably the tags that are meant to drive parser state;
# nothing in the visible code reads this list -- confirm before relying on it.
STATE_TAGS = ['head', 'body', 'script', 'style']
def __init__(self, rewriter, outstream = None):
    """Set up the parser.

    rewriter  -- object whose rewrite() / setBaseUrl() are used for urls
    outstream -- object with write(); sys.stdout is used when this is
                 omitted (or otherwise falsy)
    """
    HTMLParser.__init__(self)
    self.out = outstream or sys.stdout
    # name of the <script>/<style> element currently being parsed, if any
    self._wbParseContext = None
    self.rewriter = rewriter
def _rewriteAttr(self, mod, value):
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
def _rewriteMetaRefresh(self, metaRefresh):
    """Rewrite the url inside a meta refresh 'content' value.

    The value is matched against META_REFRESH_REGEX ("<delay>; url=<target>").
    When it matches, only the <target> span is replaced by its rewritten
    form; everything around it is kept as-is. A value that does not match,
    or whose rewrite raises, is returned unchanged.
    """
    found = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
    if found is None:
        return metaRefresh

    urlStart, urlEnd = found.span(1)
    try:
        # best effort: any failure leaves the original value in place
        return metaRefresh[:urlStart] + self._rewriteURL(found.group(1)) + metaRefresh[urlEnd:]
    except Exception:
        return metaRefresh
# ===========================
# Scheme prefixes (lowercase) whose urls must never be sent to the rewriter
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']

def _rewriteURL(self, value, mod = None):
    """Return value rewritten by the rewriter, unless it is a non-rewritable url.

    value -- url taken from an attribute (or a meta refresh target)
    mod   -- modifier passed through to rewriter.rewrite()

    A value starting with one of NO_REWRITE_PREFIX is returned untouched.
    The comparison ignores letter case and leading whitespace, because
    url schemes are case-insensitive and browsers strip surrounding
    whitespace from url attributes; otherwise 'JavaScript:...' or
    ' mailto:...' would be turned into archive links.
    """
    probe = value.lstrip().lower()
    # str.startswith accepts a tuple of alternatives
    if probe.startswith(tuple(self.NO_REWRITE_PREFIX)):
        return value

    return self.rewriter.rewrite(value, mod)
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
rwAttrs = tag_list.get(tag)
if not rwAttrs:
rwAttrs = tag_list.get('')
if not rwAttrs:
def _rewriteCSS(self, cssContent):
    """Hook for rewriting CSS text; currently returns cssContent unchanged."""
    return cssContent
def _rewriteScript(self, scriptContent):
    """Hook for rewriting script text; currently returns scriptContent unchanged."""
    return scriptContent
def hasAttr(self, tagAttrs, attr):
    """Tell whether tagAttrs contains the given (name, value) attribute.

    tagAttrs -- list of (name, value) pairs as passed to the tag handlers
    attr     -- (name, value) pair to look for

    Only the first attribute whose name equals the wanted name is
    considered; its value is compared case-insensitively. Returns False
    when no attribute has that name.

    The parser reports a valueless attribute (e.g. <input disabled>) with
    value None; such a value only matches a wanted value of None instead
    of raising AttributeError on .lower().
    """
    name, value = attr
    for attrName, attrValue in tagAttrs:
        if attrName != name:
            continue
        if attrValue is None or value is None:
            return attrValue is value
        return value.lower() == attrValue.lower()
    return False
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
    """Write out a start tag with its url-carrying attributes rewritten.

    tag        -- tag name as reported by the parser
    tagAttrs   -- list of (name, value) pairs; value is None for an
                  attribute written without a value
    isStartEnd -- True for a self-closing tag ('/>'), False for '>'

    Returns False, writing nothing, when the tag has no entry in
    REWRITE_TAGS; the caller then emits the original tag text. Otherwise
    the tag is written to self.out and True is returned.

    Fixes over the previous version:
    * <base href>: tagAttrs is a list of pairs, not a dict, so the href is
      looked up by scanning the pairs (tagAttrs.get() raised AttributeError).
    * valueless attributes (value None) are written back as a bare name
      instead of crashing on .startswith() / being emitted as ="None".
    """
    handler = WBHtml.REWRITE_TAGS.get(tag)
    if not handler:
        handler = WBHtml.REWRITE_TAGS.get('')

    if not handler:
        return False

    # special case: base tag -- tell the rewriter about the new base url
    if tag == 'base':
        # NOTE(review): the href is handed over as written; whether a
        # relative base href should first be resolved is not visible here.
        for attrName, attrValue in tagAttrs:
            if attrName == 'href' and attrValue:
                self.rewriter.setBaseUrl(attrValue)
                break
    # special case: script or style parse context
    # NOTE(review): 'style' has no REWRITE_TAGS entry, so a <style> tag
    # returns False above and never reaches this branch.
    elif tag in ('script', 'style') and self._wbParseContext is None:
        self._wbParseContext = tag

    self.out.write('<' + tag)

    for attrName, attrValue in tagAttrs:
        # attribute without a value: nothing to rewrite, keep it bare
        if attrValue is None:
            self.out.write(' ' + attrName)
            continue

        # special case: inline JS/event handler
        if attrValue.startswith('javascript:') or attrName.startswith("on"):
            attrValue = self._rewriteScript(attrValue)

        # special case: inline CSS/style attribute
        elif attrName == 'style':
            attrValue = self._rewriteCSS(attrValue)

        # special case: meta tag
        elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
            attrValue = self._rewriteMetaRefresh(attrValue)

        else:
            rwMod = handler.get(attrName)
            # '' is a valid (empty) modifier, only None means "not a url"
            if rwMod is not None:
                attrValue = self._rewriteURL(attrValue, rwMod)

        # NOTE(review): the value is written inside double quotes without
        # escaping; confirm whether '"' / '&' in values need re-escaping.
        self.out.write(' {0}="{1}"'.format(attrName, attrValue))

    self.out.write('/>' if isStartEnd else '>')
    return True
def handle_starttag(self, tag, attrs):
    """Parser callback for a start tag.

    Lets rewriteTagAttrs emit the tag; when it declines (returns a false
    value) the original tag text is written out unchanged.
    """
    handled = self.rewriteTagAttrs(tag, attrs, False)
    if handled:
        return
    self.out.write(self.get_starttag_text())
def handle_startendtag(self, tag, attrs):
    """Parser callback for a self-closing tag (<tag ... />).

    Same as the start tag case, except rewriteTagAttrs is asked to close
    the tag with '/>'. When it declines, the original tag text is written.
    """
    handled = self.rewriteTagAttrs(tag, attrs, True)
    if handled:
        return
    self.out.write(self.get_starttag_text())
def handle_endtag(self, tag):
    """Parser callback for an end tag.

    Clears the script/style parse context when the tag that opened it is
    closed, then writes '</tag>'.
    """
    if self._wbParseContext == tag:
        self._wbParseContext = None

    closing = ''.join(('</', tag, '>'))
    self.out.write(closing)
def handle_data(self, data):
def parseData(self, data):
    """Write data to the output, rewriting it first when inside script/style.

    Inside a 'script' context the text goes through _rewriteScript, inside
    a 'style' context through _rewriteCSS; otherwise it is written as-is.
    """
    context = self._wbParseContext
    if context == 'script':
        data = self._rewriteScript(data)
    elif context == 'style':
        data = self._rewriteCSS(data)

    self.out.write(data)
def handle_data(self, data):
    """Parser callback for text content; hands the text to parseData."""
    self.parseData(data)
def handle_entityref(self, data):
    """Parser callback for a named entity reference; writes it back out.

    data -- entity name without the leading '&' and trailing ';'
            (e.g. 'amp' for '&amp;')

    The parser consumes the terminating ';' and passes only the name, so
    the ';' has to be added back here; without it '&amp;x' came out as
    '&ampx'. A reference that was written without ';' in the input is
    thereby normalized to the terminated form.
    """
    self.out.write('&' + data + ';')
@ -72,7 +177,9 @@ class WBHtml(HTMLParser):
self.out.write('&#' + data)
def handle_comment(self, data):
self.out.write('<!--' + data + '-->')
self.out.write('<!--')
self.parseData(data)
self.out.write('-->')
def handle_decl(self, data):
    """Parser callback for a declaration; writes it back as '<!' + data + '>'."""
    declaration = ''.join(('<!', data, '>'))
    self.out.write(declaration)
@ -81,9 +188,9 @@ class WBHtml(HTMLParser):
self.out.write('<?' + data + '>')
def unknown_decl(self, data):
self.out.write('<![' + data + ']>')
self.out.write('<![')
self.parseData(data)
self.out.write(']>')
# instantiate the parser and feed it some HTML

View File

@ -20,6 +20,12 @@ class ArchivalUrlRewriter:
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
@ -28,25 +34,33 @@ class ArchivalUrlRewriter:
"""
def __init__(self, wburl_str, prefix):
self.wburl_str = wburl_str
self.wburl = ArchivalUrl(wburl_str)
self.prefix = prefix
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
def rewrite(self, rel_url, mod = None):
if '../' in rel_url or mod:
wburl = ArchivalUrl(self.wburl_str)
wburl.url = urlparse.urljoin(wburl.url, rel_url)
wburl.url = wburl.url.replace('../', '')
if mod is not None:
wburl.mod = mod
wburl = self.wburl
final_url = self.prefix + str(wburl)
else:
final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url)
# Disable optimization, doesn't work for external links
# if relative path or different mod, create rewrite from split up ArchivalUrl
#if rel_url.startswith('/') or ('../' in rel_url) or mod:
newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
if mod is None:
mod = wburl.mod
final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
# otherwise, optimize, and join directly with full url
#else:
# final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
return final_url
def setBaseUrl(self, newUrl):
    """Replace the url of the archival url that rewrites are resolved against."""
    archival = self.wburl
    archival.url = newUrl
if __name__ == "__main__":
import doctest