mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
html parser fleshed out!
This commit is contained in:
parent
fbf29e80d6
commit
37e57f7013
@ -129,24 +129,28 @@ class ArchivalUrl:
|
|||||||
|
|
||||||
# Str Representation
|
# Str Representation
|
||||||
# ====================
|
# ====================
|
||||||
def __str__(self):
|
@staticmethod
|
||||||
if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
|
def to_str(atype, mod, timestamp, url):
|
||||||
|
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
|
||||||
tsmod = "/"
|
tsmod = "/"
|
||||||
if self.mod:
|
if mod:
|
||||||
tsmod += self.mod + "/"
|
tsmod += mod + "/"
|
||||||
if self.timestamp:
|
if timestamp:
|
||||||
tsmod += self.timestamp
|
tsmod += timestamp
|
||||||
|
|
||||||
tsmod += "*/" + self.url
|
tsmod += "*/" + url
|
||||||
if self.type == ArchivalUrl.URL_QUERY:
|
if atype == ArchivalUrl.URL_QUERY:
|
||||||
tsmod += "*"
|
tsmod += "*"
|
||||||
return tsmod
|
return tsmod
|
||||||
else:
|
else:
|
||||||
tsmod = self.timestamp + self.mod
|
tsmod = timestamp + mod
|
||||||
if len(tsmod) > 0:
|
if len(tsmod) > 0:
|
||||||
return "/" + tsmod + "/" + self.url
|
return "/" + tsmod + "/" + url
|
||||||
else:
|
else:
|
||||||
return "/" + self.url
|
return "/" + url
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
||||||
|
155
pywb/wbhtml.py
155
pywb/wbhtml.py
@ -1,12 +1,10 @@
|
|||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
from wburlrewriter import ArchivalUrlRewriter
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
tag_list = {
|
|
||||||
'a': {'href': ''},
|
|
||||||
'img': {'src': 'im_'}
|
|
||||||
}
|
|
||||||
|
|
||||||
# create a subclass and override the handler methods
|
# create a subclass and override the handler methods
|
||||||
class WBHtml(HTMLParser):
|
class WBHtml(HTMLParser):
|
||||||
@ -14,57 +12,164 @@ class WBHtml(HTMLParser):
|
|||||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||||
|
|
||||||
>>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
|
>>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||||
<img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
|
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||||
|
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
REWRITE_TAGS = {
|
||||||
|
'a': {'href': ''},
|
||||||
|
'applet': {'codebase': 'oe_',
|
||||||
|
'archive': 'oe_'},
|
||||||
|
'area': {'href': ''},
|
||||||
|
'base': {'href': ''},
|
||||||
|
'blockquote': {'cite': ''},
|
||||||
|
'body': {'background': 'im_'},
|
||||||
|
'del': {'cite': ''},
|
||||||
|
'embed': {'src': 'oe_'},
|
||||||
|
'iframe': {'src': 'if_'},
|
||||||
|
'img': {'src': 'im_'},
|
||||||
|
'ins': {'cite': ''},
|
||||||
|
'input': {'src': 'im_'},
|
||||||
|
'form': {'action': ''},
|
||||||
|
'frame': {'src': 'fr_'},
|
||||||
|
'link': {'href': 'oe_'},
|
||||||
|
'meta': {'content': ''},
|
||||||
|
'object': {'codebase': 'oe_',
|
||||||
|
'data': 'oe_'},
|
||||||
|
'q': {'cite': ''},
|
||||||
|
'script': {'src': 'js_'},
|
||||||
|
'div': {'data-src' : '',
|
||||||
|
'data-uri' : ''},
|
||||||
|
'li': {'data-src' : '',
|
||||||
|
'data-uri' : ''},
|
||||||
|
}
|
||||||
|
|
||||||
|
STATE_TAGS = ['head', 'body', 'script', 'style']
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, rewriter, outstream = None):
|
def __init__(self, rewriter, outstream = None):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
|
|
||||||
self.rewriter = rewriter
|
self.rewriter = rewriter
|
||||||
|
self._wbParseContext = None
|
||||||
self.out = outstream if outstream else sys.stdout
|
self.out = outstream if outstream else sys.stdout
|
||||||
|
|
||||||
def _rewriteAttr(self, mod, value):
|
|
||||||
|
# ===========================
|
||||||
|
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
||||||
|
|
||||||
|
def _rewriteMetaRefresh(self, metaRefresh):
|
||||||
|
m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
|
||||||
|
if not m:
|
||||||
|
return metaRefresh
|
||||||
|
|
||||||
|
try:
|
||||||
|
metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return metaRefresh
|
||||||
|
# ===========================
|
||||||
|
|
||||||
|
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
|
||||||
|
|
||||||
|
def _rewriteURL(self, value, mod = None):
|
||||||
|
if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
|
||||||
|
return value
|
||||||
|
|
||||||
return self.rewriter.rewrite(value, mod)
|
return self.rewriter.rewrite(value, mod)
|
||||||
|
|
||||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
|
||||||
rwAttrs = tag_list.get(tag)
|
|
||||||
if not rwAttrs:
|
|
||||||
rwAttrs = tag_list.get('')
|
|
||||||
|
|
||||||
if not rwAttrs:
|
def _rewriteCSS(self, cssContent):
|
||||||
|
return cssContent
|
||||||
|
|
||||||
|
def _rewriteScript(self, scriptContent):
|
||||||
|
return scriptContent
|
||||||
|
|
||||||
|
def hasAttr(self, tagAttrs, attr):
|
||||||
|
name, value = attr
|
||||||
|
for attrName, attrValue in tagAttrs:
|
||||||
|
if attrName == name:
|
||||||
|
return value.lower() == attrValue.lower()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
||||||
|
handler = WBHtml.REWRITE_TAGS.get(tag)
|
||||||
|
if not handler:
|
||||||
|
handler = WBHtml.REWRITE_TAGS.get('')
|
||||||
|
|
||||||
|
if not handler:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# special case: base tag
|
||||||
|
if (tag == 'base'):
|
||||||
|
newBase = tagAttrs.get('href')
|
||||||
|
if newBase:
|
||||||
|
self.rewriter.setBaseUrl(newBase[1])
|
||||||
|
|
||||||
|
# special case: script or style parse context
|
||||||
|
elif ((tag == 'script') or (tag == 'style')) and (self._wbParseContext == None):
|
||||||
|
self._wbParseContext = tag
|
||||||
|
|
||||||
self.out.write('<' + tag)
|
self.out.write('<' + tag)
|
||||||
|
|
||||||
for attr in tagAttrs:
|
for attr in tagAttrs:
|
||||||
name, value = attr
|
attrName, attrValue = attr
|
||||||
rwMod = rwAttrs.get(name)
|
|
||||||
|
|
||||||
if rwMod is not None:
|
# special case: inline JS/event handler
|
||||||
value = self._rewriteAttr(rwMod, value)
|
if attrValue.startswith('javascript:') or attrName.startswith("on"):
|
||||||
|
attrValue = self._rewriteScript(attrValue)
|
||||||
|
|
||||||
self.out.write(' {0}="{1}"'.format(name, value))
|
# special case: inline CSS/style attribute
|
||||||
|
elif attrName == 'style':
|
||||||
|
attrValue = self._rewriteCSS(attrValue)
|
||||||
|
|
||||||
|
# special case: meta tag
|
||||||
|
elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
|
||||||
|
attrValue = self._rewriteMetaRefresh(attrValue)
|
||||||
|
|
||||||
|
else:
|
||||||
|
rwMod = handler.get(attrName)
|
||||||
|
if rwMod is not None:
|
||||||
|
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||||
|
|
||||||
|
self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||||
|
|
||||||
self.out.write('/>' if isStartEnd else '>')
|
self.out.write('/>' if isStartEnd else '>')
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
|
|
||||||
if not self.rewriteTagAttrs(tag, attrs, False):
|
if not self.rewriteTagAttrs(tag, attrs, False):
|
||||||
self.out.write(self.get_starttag_text())
|
self.out.write(self.get_starttag_text())
|
||||||
|
|
||||||
def handle_startendtag(self, tag, attrs):
|
def handle_startendtag(self, tag, attrs):
|
||||||
|
|
||||||
if not self.rewriteTagAttrs(tag, attrs, True):
|
if not self.rewriteTagAttrs(tag, attrs, True):
|
||||||
self.out.write(self.get_starttag_text())
|
self.out.write(self.get_starttag_text())
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
|
if (tag == self._wbParseContext):
|
||||||
|
self._wbParseContext = None
|
||||||
|
|
||||||
self.out.write('</' + tag + '>')
|
self.out.write('</' + tag + '>')
|
||||||
|
|
||||||
def handle_data(self, data):
|
def parseData(self, data):
|
||||||
|
if self._wbParseContext == 'script':
|
||||||
|
data = self._rewriteScript(data)
|
||||||
|
elif self._wbParseContext == 'style':
|
||||||
|
data = self._rewriteCSS(data)
|
||||||
|
|
||||||
self.out.write(data)
|
self.out.write(data)
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
self.parseData(data)
|
||||||
|
|
||||||
def handle_entityref(self, data):
|
def handle_entityref(self, data):
|
||||||
self.out.write('&' + data)
|
self.out.write('&' + data)
|
||||||
|
|
||||||
@ -72,7 +177,9 @@ class WBHtml(HTMLParser):
|
|||||||
self.out.write('&#' + data)
|
self.out.write('&#' + data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
self.out.write('<!--' + data + '-->')
|
self.out.write('<!--')
|
||||||
|
self.parseData(data)
|
||||||
|
self.out.write('-->')
|
||||||
|
|
||||||
def handle_decl(self, data):
|
def handle_decl(self, data):
|
||||||
self.out.write('<!' + data + '>')
|
self.out.write('<!' + data + '>')
|
||||||
@ -81,9 +188,9 @@ class WBHtml(HTMLParser):
|
|||||||
self.out.write('<?' + data + '>')
|
self.out.write('<?' + data + '>')
|
||||||
|
|
||||||
def unknown_decl(self, data):
|
def unknown_decl(self, data):
|
||||||
self.out.write('<![' + data + ']>')
|
self.out.write('<![')
|
||||||
|
self.parseData(data)
|
||||||
|
self.out.write(']>')
|
||||||
|
|
||||||
|
|
||||||
# instantiate the parser and feed it some HTML
|
# instantiate the parser and feed it some HTML
|
||||||
|
@ -20,6 +20,12 @@ class ArchivalUrlRewriter:
|
|||||||
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
|
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
|
||||||
'localhost:8080/*/http://example.com/other.html'
|
'localhost:8080/*/http://example.com/other.html'
|
||||||
|
|
||||||
|
>>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
|
||||||
|
'localhost:8080/*/http://example.com/other.html'
|
||||||
|
|
||||||
|
>>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||||
|
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||||
|
|
||||||
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
|
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
|
||||||
'/2020/http://example.com/other.html'
|
'/2020/http://example.com/other.html'
|
||||||
|
|
||||||
@ -28,25 +34,33 @@ class ArchivalUrlRewriter:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, wburl_str, prefix):
|
def __init__(self, wburl_str, prefix):
|
||||||
self.wburl_str = wburl_str
|
self.wburl = ArchivalUrl(wburl_str)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
|
||||||
if self.prefix.endswith('/'):
|
if self.prefix.endswith('/'):
|
||||||
self.prefix = self.prefix[:-1]
|
self.prefix = self.prefix[:-1]
|
||||||
|
|
||||||
def rewrite(self, rel_url, mod = None):
|
def rewrite(self, rel_url, mod = None):
|
||||||
if '../' in rel_url or mod:
|
wburl = self.wburl
|
||||||
wburl = ArchivalUrl(self.wburl_str)
|
|
||||||
wburl.url = urlparse.urljoin(wburl.url, rel_url)
|
|
||||||
wburl.url = wburl.url.replace('../', '')
|
|
||||||
if mod is not None:
|
|
||||||
wburl.mod = mod
|
|
||||||
|
|
||||||
final_url = self.prefix + str(wburl)
|
# Disable optimization, doesn't work for external links
|
||||||
else:
|
# if relative path or different mod, create rewrite from split up ArchivalUrl
|
||||||
final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url)
|
#if rel_url.startswith('/') or ('../' in rel_url) or mod:
|
||||||
|
newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
|
||||||
|
|
||||||
|
if mod is None:
|
||||||
|
mod = wburl.mod
|
||||||
|
|
||||||
|
final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||||
|
# otherwise, optimize, and join directly with full url
|
||||||
|
#else:
|
||||||
|
# final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
|
||||||
|
|
||||||
return final_url
|
return final_url
|
||||||
|
|
||||||
|
def setBaseUrl(self, newUrl):
|
||||||
|
self.wburl.url = newUrl
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user