mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
html parser fleshed out!
This commit is contained in:
parent
fbf29e80d6
commit
37e57f7013
@ -129,24 +129,28 @@ class ArchivalUrl:
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
def __str__(self):
|
||||
if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
|
||||
@staticmethod
def to_str(atype, mod, timestamp, url):
    """Format archival-URL components as a path string.

    For ``ArchivalUrl.QUERY`` and ``ArchivalUrl.URL_QUERY`` the result is
    ``/[<mod>/][<timestamp>]*/<url>``, with a trailing ``*`` appended for
    ``URL_QUERY``.  For every other type the result is
    ``/<timestamp><mod>/<url>``, or just ``/<url>`` when both timestamp
    and mod are empty.

    Parameters:
        atype: one of the ArchivalUrl type constants.
        mod: modifier string (may be empty or None).
        timestamp: timestamp string (may be empty or None).
        url: the target url, appended verbatim.

    Returns:
        The formatted path, always starting with ``/``.
    """
    # The query branch already treated mod/timestamp as optional; normalise
    # here so the replay branch cannot raise TypeError on None either.
    mod = mod or ""
    timestamp = timestamp or ""

    if atype in (ArchivalUrl.QUERY, ArchivalUrl.URL_QUERY):
        tsmod = "/"
        if mod:
            tsmod += mod + "/"
        tsmod += timestamp + "*/" + url
        if atype == ArchivalUrl.URL_QUERY:
            tsmod += "*"
        return tsmod

    tsmod = timestamp + mod
    if tsmod:
        return "/" + tsmod + "/" + url
    return "/" + url
|
||||
|
||||
def __str__(self):
    """Return this url formatted by ``ArchivalUrl.to_str`` from its own fields."""
    return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url)
|
||||
|
||||
def __repr__(self):
    """Return a debug string: the tuple (type, timestamp, mod, url, str(self))."""
    fields = (self.type, self.timestamp, self.mod, self.url, str(self))
    return str(fields)
|
||||
|
155
pywb/wbhtml.py
155
pywb/wbhtml.py
@ -1,12 +1,10 @@
|
||||
import sys
|
||||
import re
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
from wburlrewriter import ArchivalUrlRewriter
|
||||
|
||||
tag_list = {
|
||||
'a': {'href': ''},
|
||||
'img': {'src': 'im_'}
|
||||
}
|
||||
|
||||
|
||||
# create a subclass and override the handler methods
|
||||
class WBHtml(HTMLParser):
|
||||
@ -14,57 +12,164 @@ class WBHtml(HTMLParser):
|
||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
|
||||
<img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
|
||||
>>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||
"""
|
||||
|
||||
# Maps tag name -> {attribute name -> rewrite modifier}.  An attribute listed
# here has its value passed through the URL rewriter with that modifier; an
# empty modifier string still triggers a rewrite (only a missing key skips it).
REWRITE_TAGS = {
    'a':          {'href': ''},
    'applet':     {'codebase': 'oe_',
                   'archive': 'oe_'},
    'area':       {'href': ''},
    'base':       {'href': ''},
    'blockquote': {'cite': ''},
    'body':       {'background': 'im_'},
    'del':        {'cite': ''},
    'embed':      {'src': 'oe_'},
    'iframe':     {'src': 'if_'},
    'img':        {'src': 'im_'},
    'ins':        {'cite': ''},
    'input':      {'src': 'im_'},
    'form':       {'action': ''},
    'frame':      {'src': 'fr_'},
    'link':       {'href': 'oe_'},
    'meta':       {'content': ''},
    'object':     {'codebase': 'oe_',
                   'data': 'oe_'},
    'q':          {'cite': ''},
    'script':     {'src': 'js_'},
    'div':        {'data-src': '',
                   'data-uri': ''},
    'li':         {'data-src': '',
                   'data-uri': ''},
}

# Tags that are candidates for parser-state tracking.
STATE_TAGS = ['head', 'body', 'script', 'style']
|
||||
|
||||
|
||||
def __init__(self, rewriter, outstream = None):
    """Create a rewriting parser.

    Parameters:
        rewriter: object used to rewrite urls (its ``rewrite`` and
            ``setBaseUrl`` methods are called by this class).
        outstream: object with a ``write`` method that receives the
            rewritten markup; defaults to ``sys.stdout``.
    """
    HTMLParser.__init__(self)

    self.rewriter = rewriter
    # Name of the tag ('script' / 'style') whose content is currently being
    # parsed, or None when outside such a tag.
    self._wbParseContext = None
    # Compare against None explicitly: a valid stream may be falsy (e.g. a
    # buffer type defining __len__) and must not be swapped for stdout.
    self.out = outstream if outstream is not None else sys.stdout
|
||||
|
||||
def _rewriteAttr(self, mod, value):
|
||||
|
||||
# ===========================
|
||||
# Matches the content of a meta refresh: "<delay>; url=<target>".  The target
# may optionally be wrapped in single or double quotes; group 1 is the bare
# target without quotes or surrounding whitespace.
META_REFRESH_REGEX = re.compile(
    r'''^\s*[\d.]+\s*;\s*url\s*=\s*['"]?(.+?)['"]?\s*$''',
    re.IGNORECASE | re.MULTILINE)

def _rewriteMetaRefresh(self, metaRefresh):
    """Rewrite the url inside a meta-refresh ``content`` value.

    Parameters:
        metaRefresh: the raw ``content`` attribute value.

    Returns:
        The value with only the url portion replaced by its rewritten
        form.  The input is returned unchanged when it does not look like
        ``<delay>; url=<target>`` or when rewriting the url fails.
    """
    m = self.META_REFRESH_REGEX.match(metaRefresh)
    if not m:
        return metaRefresh

    try:
        rewritten = self._rewriteURL(m.group(1))
    except Exception:
        # Deliberate best effort: a url the rewriter cannot handle must not
        # abort rewriting of the whole document, so keep the original value.
        return metaRefresh

    # Splice so the delay, "url=" text and any quotes are kept verbatim.
    return metaRefresh[:m.start(1)] + rewritten + metaRefresh[m.end(1):]
|
||||
# ===========================
|
||||
|
||||
# Url prefixes that are never passed to the rewriter.
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']

def _rewriteURL(self, value, mod = None):
    """Rewrite a single url value via ``self.rewriter``.

    Parameters:
        value: the url as found in the document (may be None for an
            attribute that has no value).
        mod: rewrite modifier handed through to ``rewriter.rewrite``.

    Returns:
        ``value`` unchanged when it is None or starts with one of
        ``NO_REWRITE_PREFIX``; otherwise the rewriter's result.
    """
    if value is None:
        return value

    # Scheme names are case-insensitive and may be preceded by whitespace in
    # real-world markup, so normalise before the prefix comparison.
    if value.lstrip().lower().startswith(tuple(self.NO_REWRITE_PREFIX)):
        return value

    return self.rewriter.rewrite(value, mod)
|
||||
|
||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
||||
rwAttrs = tag_list.get(tag)
|
||||
if not rwAttrs:
|
||||
rwAttrs = tag_list.get('')
|
||||
|
||||
if not rwAttrs:
|
||||
def _rewriteCSS(self, cssContent):
    """Hook for rewriting CSS text; currently returns the input unchanged."""
    return cssContent
|
||||
|
||||
def _rewriteScript(self, scriptContent):
    """Hook for rewriting script text; currently returns the input unchanged."""
    return scriptContent
|
||||
|
||||
def hasAttr(self, tagAttrs, attr):
    """Tell whether a tag carries a given attribute with a given value.

    Parameters:
        tagAttrs: iterable of ``(name, value)`` pairs for the tag.
        attr: the ``(name, value)`` pair to look for.

    Returns:
        True when the first attribute named ``name`` has a value equal to
        ``value`` ignoring case; False when the attribute is absent, has
        no value (None), or has a different value.
    """
    name, value = attr
    for attrName, attrValue in tagAttrs:
        if attrName == name:
            # A valueless attribute is reported as None by the parser and
            # can never match; guard it instead of calling .lower() on None.
            return attrValue is not None and value.lower() == attrValue.lower()
    return False
|
||||
|
||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
    """Write a start tag to ``self.out`` with its attributes rewritten.

    Parameters:
        tag: tag name, used as the key into ``REWRITE_TAGS``.
        tagAttrs: list of ``(name, value)`` pairs; ``value`` is None for
            an attribute written without a value.
        isStartEnd: True to terminate the tag with ``/>`` instead of ``>``.

    Returns:
        True when the tag was written by this method; False when no
        rewrite rules exist for it, in which case nothing is written and
        the caller is expected to emit the original tag text.
    """
    # Record the script/style context before the rule lookup: a tag with no
    # rewritable attributes (such as style) would otherwise return early and
    # never be tracked.
    if tag in ('script', 'style') and self._wbParseContext is None:
        self._wbParseContext = tag

    handler = self.REWRITE_TAGS.get(tag) or self.REWRITE_TAGS.get('')
    if not handler:
        return False

    # special case: base tag -- later relative urls resolve against its href.
    # tagAttrs is a list of pairs, so search it rather than calling .get().
    if tag == 'base':
        for attrName, attrValue in tagAttrs:
            if attrName == 'href' and attrValue:
                self.rewriter.setBaseUrl(attrValue)
                break

    self.out.write('<' + tag)

    for attrName, attrValue in tagAttrs:
        # Valueless attribute (e.g. <input disabled>): emit the bare name.
        if attrValue is None:
            self.out.write(' ' + attrName)
            continue

        # special case: inline JS/event handler
        if attrValue.startswith('javascript:') or attrName.startswith('on'):
            attrValue = self._rewriteScript(attrValue)

        # special case: inline CSS/style attribute
        elif attrName == 'style':
            attrValue = self._rewriteCSS(attrValue)

        # special case: meta tag -- content holds a url only for a refresh;
        # any other meta content (description, charset, ...) is left alone.
        elif tag == 'meta' and attrName == 'content':
            if self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
                attrValue = self._rewriteMetaRefresh(attrValue)

        else:
            rwMod = handler.get(attrName)
            if rwMod is not None:
                attrValue = self._rewriteURL(attrValue, rwMod)

        # The parser hands over attribute values with entities decoded, so
        # re-escape the characters that would break a double-quoted value.
        escaped = attrValue.replace('&', '&amp;').replace('"', '&quot;')
        self.out.write(' {0}="{1}"'.format(attrName, escaped))

    self.out.write('/>' if isStartEnd else '>')

    return True
|
||||
|
||||
def handle_starttag(self, tag, attrs):
    """Parser callback for a start tag.

    Delegates to ``rewriteTagAttrs``; when that reports the tag was not
    handled, the original tag text is written through unchanged.
    """
    if not self.rewriteTagAttrs(tag, attrs, False):
        self.out.write(self.get_starttag_text())
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
    """Parser callback for a self-closed tag (``<tag ... />``).

    Delegates to ``rewriteTagAttrs`` with ``isStartEnd`` set; when that
    reports the tag was not handled, the original tag text is written
    through unchanged.
    """
    if not self.rewriteTagAttrs(tag, attrs, True):
        self.out.write(self.get_starttag_text())
|
||||
|
||||
def handle_endtag(self, tag):
    """Parser callback for an end tag: leave any matching parse context and
    write ``</tag>`` to the output."""
    # Closing the tag that opened the current script/style context ends it.
    if tag == self._wbParseContext:
        self._wbParseContext = None

    self.out.write('</' + tag + '>')
|
||||
|
||||
def handle_data(self, data):
|
||||
def parseData(self, data):
    """Write ``data`` to the output, first passing it through the script or
    CSS rewriter when currently inside a script or style context."""
    if self._wbParseContext == 'script':
        data = self._rewriteScript(data)
    elif self._wbParseContext == 'style':
        data = self._rewriteCSS(data)

    self.out.write(data)
|
||||
|
||||
def handle_data(self, data):
    """Parser callback for text content; forwards it to ``parseData``."""
    self.parseData(data)
|
||||
|
||||
def handle_entityref(self, data):
    """Parser callback for a named entity reference; re-emits it.

    ``data`` is the bare entity name (e.g. ``amp``).  The terminating
    ``;`` is restored so that following text cannot run into the entity
    name (``&amp;lt`` must not be written back as ``&amplt``).
    """
    self.out.write('&' + data + ';')
|
||||
|
||||
@ -72,7 +177,9 @@ class WBHtml(HTMLParser):
|
||||
self.out.write('&#' + data)
|
||||
|
||||
def handle_comment(self, data):
    """Parser callback for a comment: write it back wrapped in ``<!--`` and
    ``-->``, with the body routed through ``parseData`` so that it is
    rewritten when inside a script or style context."""
    self.out.write('<!--')
    self.parseData(data)
    self.out.write('-->')
|
||||
|
||||
def handle_decl(self, data):
    """Parser callback for a declaration such as a doctype; writes it back
    unchanged as ``<!data>``."""
    self.out.write('<!' + data + '>')
|
||||
@ -81,9 +188,9 @@ class WBHtml(HTMLParser):
|
||||
self.out.write('<?' + data + '>')
|
||||
|
||||
def unknown_decl(self, data):
    """Parser callback for an unrecognised ``<![...]>`` declaration: write it
    back with the body routed through ``parseData`` so that it is rewritten
    when inside a script or style context."""
    self.out.write('<![')
    self.parseData(data)
    self.out.write(']>')
|
||||
|
||||
|
||||
# instantiate the parser and feed it some HTML
|
||||
|
@ -20,6 +20,12 @@ class ArchivalUrlRewriter:
|
||||
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/*/http://example.com/other.html'
|
||||
|
||||
>>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/*/http://example.com/other.html'
|
||||
|
||||
>>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||
|
||||
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
|
||||
'/2020/http://example.com/other.html'
|
||||
|
||||
@ -28,25 +34,33 @@ class ArchivalUrlRewriter:
|
||||
"""
|
||||
|
||||
def __init__(self, wburl_str, prefix):
    """Create a rewriter for pages captured under ``wburl_str``.

    Parameters:
        wburl_str: archival url string of the page being rewritten; it is
            parsed into an ``ArchivalUrl`` held as ``self.wburl``.
        prefix: string prepended to every rewritten url.  One trailing
            ``/`` is removed, because the formatted archival path already
            begins with ``/``.
    """
    # NOTE(review): the raw string is kept alongside the parsed form because
    # it is not visible here whether other code still reads it -- confirm.
    self.wburl_str = wburl_str
    self.wburl = ArchivalUrl(wburl_str)

    self.prefix = prefix[:-1] if prefix.endswith('/') else prefix
|
||||
|
||||
def rewrite(self, rel_url, mod = None):
    """Rewrite a url found in the page into its archival form.

    Parameters:
        rel_url: url as it appears in the page, relative or absolute.
        mod: modifier for the rewritten url; when None the modifier of
            the page's own archival url is used.

    Returns:
        ``self.prefix`` followed by the archival path for the resolved url.
    """
    wburl = self.wburl

    # Always rebuild from the split-up ArchivalUrl.  The shortcut of joining
    # directly against the full archival url string is disabled because it
    # does not work for external links.
    #
    # NOTE(review): the replace() removes every '../' in the joined url, not
    # only leftover leading parent references -- confirm this is intended
    # for urls that contain '../' in a query string or inside a segment.
    newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')

    if mod is None:
        mod = wburl.mod

    return self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||
|
||||
def setBaseUrl(self, newUrl):
    """Change the url against which later relative urls are resolved.

    Parameters:
        newUrl: the new base.  It is resolved against the current url
            first, so a relative value (e.g. ``/dir/``) yields an absolute
            base; an absolute value replaces the current url outright.
    """
    self.wburl.url = urlparse.urljoin(self.wburl.url, newUrl)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user