diff --git a/pywb/wbarchivalurl.py b/pywb/wbarchivalurl.py index 85bb4cbe..89345f02 100644 --- a/pywb/wbarchivalurl.py +++ b/pywb/wbarchivalurl.py @@ -129,24 +129,28 @@ class ArchivalUrl: # Str Representation # ==================== - def __str__(self): - if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY: + @staticmethod + def to_str(atype, mod, timestamp, url): + if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY: tsmod = "/" - if self.mod: - tsmod += self.mod + "/" - if self.timestamp: - tsmod += self.timestamp + if mod: + tsmod += mod + "/" + if timestamp: + tsmod += timestamp - tsmod += "*/" + self.url - if self.type == ArchivalUrl.URL_QUERY: + tsmod += "*/" + url + if atype == ArchivalUrl.URL_QUERY: tsmod += "*" return tsmod else: - tsmod = self.timestamp + self.mod + tsmod = timestamp + mod if len(tsmod) > 0: - return "/" + tsmod + "/" + self.url + return "/" + tsmod + "/" + url else: - return "/" + self.url + return "/" + url + + def __str__(self): + return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url) def __repr__(self): return str((self.type, self.timestamp, self.mod, self.url, str(self))) diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py index 7574cf77..d707eef3 100644 --- a/pywb/wbhtml.py +++ b/pywb/wbhtml.py @@ -1,12 +1,10 @@ import sys +import re from HTMLParser import HTMLParser from wburlrewriter import ArchivalUrlRewriter -tag_list = { - 'a': {'href': ''}, - 'img': {'src': 'im_'} -} + # create a subclass and override the handler methods class WBHtml(HTMLParser): @@ -14,57 +12,164 @@ class WBHtml(HTMLParser): >>> WBHtml(rewriter).feed('Text') Text - >>> WBHtml(rewriter).feed('
') -
+ >>> WBHtml(rewriter).feed('
') +
+ >>> WBHtml(rewriter).feed('
') +
+ + >>> WBHtml(rewriter).feed('') + """ + REWRITE_TAGS = { + 'a': {'href': ''}, + 'applet': {'codebase': 'oe_', + 'archive': 'oe_'}, + 'area': {'href': ''}, + 'base': {'href': ''}, + 'blockquote': {'cite': ''}, + 'body': {'background': 'im_'}, + 'del': {'cite': ''}, + 'embed': {'src': 'oe_'}, + 'iframe': {'src': 'if_'}, + 'img': {'src': 'im_'}, + 'ins': {'cite': ''}, + 'input': {'src': 'im_'}, + 'form': {'action': ''}, + 'frame': {'src': 'fr_'}, + 'link': {'href': 'oe_'}, + 'meta': {'content': ''}, + 'object': {'codebase': 'oe_', + 'data': 'oe_'}, + 'q': {'cite': ''}, + 'script': {'src': 'js_'}, + 'div': {'data-src' : '', + 'data-uri' : ''}, + 'li': {'data-src' : '', + 'data-uri' : ''}, + } + + STATE_TAGS = ['head', 'body', 'script', 'style'] + + def __init__(self, rewriter, outstream = None): HTMLParser.__init__(self) self.rewriter = rewriter + self._wbParseContext = None self.out = outstream if outstream else sys.stdout - def _rewriteAttr(self, mod, value): + + # =========================== + META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) + + def _rewriteMetaRefresh(self, metaRefresh): + m = WBHtml.META_REFRESH_REGEX.match(metaRefresh) + if not m: + return metaRefresh + + try: + metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):] + except Exception: + pass + + return metaRefresh + # =========================== + + NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:'] + + def _rewriteURL(self, value, mod = None): + if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX): + return value + return self.rewriter.rewrite(value, mod) - def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd): - rwAttrs = tag_list.get(tag) - if not rwAttrs: - rwAttrs = tag_list.get('') - if not rwAttrs: + def _rewriteCSS(self, cssContent): + return cssContent + + def _rewriteScript(self, scriptContent): + return scriptContent + + def hasAttr(self, tagAttrs, attr): + name, value = attr + for attrName, attrValue in tagAttrs: + if attrName == name: + return value.lower() == attrValue.lower() + return False + + def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd): + handler = WBHtml.REWRITE_TAGS.get(tag) + if not handler: + handler = WBHtml.REWRITE_TAGS.get('') + + if not handler: return False + # special case: base tag + if (tag == 'base'): + newBase = tagAttrs.get('href') + if newBase: + self.rewriter.setBaseUrl(newBase[1]) + + # special case: script or style parse context + elif ((tag == 'script') or (tag == 'style')) and (self._wbParseContext == None): + self._wbParseContext = tag + self.out.write('<' + tag) + for attr in tagAttrs: - name, value = attr - rwMod = rwAttrs.get(name) + attrName, attrValue = attr - if rwMod is not None: - value = self._rewriteAttr(rwMod, value) + # special case: inline JS/event handler + if attrValue.startswith('javascript:') or attrName.startswith("on"): + attrValue = self._rewriteScript(attrValue) - self.out.write(' {0}="{1}"'.format(name, value)) + # special case: inline CSS/style attribute + elif attrName == 'style': + attrValue = self._rewriteCSS(attrValue) + + # special case: meta tag + elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')): + attrValue = self._rewriteMetaRefresh(attrValue) + + else: + rwMod = handler.get(attrName) + if rwMod is not None: + attrValue = self._rewriteURL(attrValue, rwMod) + + self.out.write(' {0}="{1}"'.format(attrName, attrValue)) self.out.write('/>' if isStartEnd else '>') + return True def handle_starttag(self, tag, attrs): - if not self.rewriteTagAttrs(tag, attrs, False): self.out.write(self.get_starttag_text()) def handle_startendtag(self, tag, attrs): - if not self.rewriteTagAttrs(tag, attrs, True): self.out.write(self.get_starttag_text()) def handle_endtag(self, tag): + if (tag == self._wbParseContext): + self._wbParseContext = None + self.out.write('') - def handle_data(self, data): + def parseData(self, data): + if self._wbParseContext == 'script': + data = self._rewriteScript(data) + elif self._wbParseContext == 'style': + data = self._rewriteCSS(data) + self.out.write(data) + def handle_data(self, data): + self.parseData(data) + def handle_entityref(self, data): self.out.write('&' + data) @@ -72,7 +177,9 @@ class WBHtml(HTMLParser): self.out.write('&#' + data) def handle_comment(self, data): - self.out.write('') + self.out.write('') def handle_decl(self, data): self.out.write('') @@ -81,9 +188,9 @@ class WBHtml(HTMLParser): self.out.write('') def unknown_decl(self, data): - self.out.write('') - - + self.out.write('') # instantiate the parser and fed it some HTML diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py index 0ff3ef97..041dc753 100644 --- a/pywb/wburlrewriter.py +++ b/pywb/wburlrewriter.py @@ -20,6 +20,12 @@ class ArchivalUrlRewriter: >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/*/http://example.com/other.html' + >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/') + 'localhost:8080/*/http://example.com/other.html' + + >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/') + 'localhost:8080/20101226101112/http://some-other-site.com' + >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' @@ -28,25 +34,33 @@ class ArchivalUrlRewriter: """ def __init__(self, wburl_str, prefix): - self.wburl_str = wburl_str + self.wburl = ArchivalUrl(wburl_str) self.prefix = prefix + if self.prefix.endswith('/'): self.prefix = self.prefix[:-1] def rewrite(self, rel_url, mod = None): - if '../' in rel_url or mod: - wburl = ArchivalUrl(self.wburl_str) - wburl.url = urlparse.urljoin(wburl.url, rel_url) - wburl.url = wburl.url.replace('../', '') - if mod is not None: - wburl.mod = mod + wburl = self.wburl - final_url = self.prefix + str(wburl) - else: - final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url) + # Disable optimization, doesn't work for external links + # if relative path or different mod, create rewrite from split up ArchivalUrl + #if rel_url.startswith('/') or ('../' in rel_url) or mod: + newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '') + + if mod is None: + mod = wburl.mod + + final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl) + # otherwise, optimize, and join directly with full url + #else: + # final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url) return final_url + def setBaseUrl(self, newUrl): + self.wburl.url = newUrl + if __name__ == "__main__": import doctest