html parser fleshed out!

2025-03-15 00:03:28 +01:00 · 2013-12-22 18:12:05 -08:00 · 2013-12-22 18:12:05 -08:00 · 37e57f7013
commit 37e57f7013
parent fbf29e80d6
3 changed files with 170 additions and 45 deletions
--- a/pywb/wbarchivalurl.py
+++ b/pywb/wbarchivalurl.py
@ -129,24 +129,28 @@ class ArchivalUrl:

    # Str Representation
    # ====================
-    def __str__(self):
-        if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
+    @staticmethod
+    def to_str(atype, mod, timestamp, url):
+        if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
            tsmod = "/"
-            if self.mod:
-                tsmod += self.mod + "/"
-            if self.timestamp:
-                tsmod += self.timestamp
+            if mod:
+                tsmod += mod + "/"
+            if timestamp:
+                tsmod += timestamp

-            tsmod += "*/" + self.url
-            if self.type == ArchivalUrl.URL_QUERY:
+            tsmod += "*/" + url
+            if atype == ArchivalUrl.URL_QUERY:
                tsmod += "*"
            return tsmod
        else:
-            tsmod = self.timestamp + self.mod
+            tsmod = timestamp + mod
            if len(tsmod) > 0:
-                return "/" + tsmod + "/" + self.url
+                return "/" + tsmod + "/" + url
            else:
-                return "/" + self.url
+                return "/" + url
+
+    def __str__(self):
+        return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url)

    def __repr__(self):
        return str((self.type, self.timestamp, self.mod, self.url, str(self)))
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -1,12 +1,10 @@
 import sys
+import re

 from HTMLParser import HTMLParser
 from wburlrewriter import ArchivalUrlRewriter

-tag_list = {
-    'a': {'href': ''},
-    'img': {'src': 'im_'}
-}
+

 # create a subclass and override the handler methods
 class WBHtml(HTMLParser):
@ -14,57 +12,164 @@ class WBHtml(HTMLParser):
    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

-    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
-    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
+    >>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
+    <body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

+    >>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
+    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
+
+    >>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
+    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
    """

+    REWRITE_TAGS = {
+        'a':       {'href': ''},
+        'applet':  {'codebase': 'oe_',
+                    'archive': 'oe_'},
+        'area':    {'href': ''},
+        'base':    {'href': ''},
+        'blockquote': {'cite': ''},
+        'body':    {'background': 'im_'},
+        'del':     {'cite': ''},
+        'embed':   {'src': 'oe_'},
+        'iframe':  {'src': 'if_'},
+        'img':     {'src': 'im_'},
+        'ins':     {'cite': ''},
+        'input':   {'src': 'im_'},
+        'form':    {'action': ''},
+        'frame':   {'src': 'fr_'},
+        'link':    {'href': 'oe_'},
+        'meta':    {'content': ''},
+        'object':  {'codebase': 'oe_',
+                    'data': 'oe_'},
+        'q':       {'cite': ''},
+        'script':  {'src': 'js_'},
+        'div':     {'data-src' : '',
+                    'data-uri' : ''},
+        'li':      {'data-src' : '',
+                    'data-uri' : ''},
+    }
+
+    STATE_TAGS = ['head', 'body', 'script', 'style']
+
+
    def __init__(self, rewriter, outstream = None):
        HTMLParser.__init__(self)

        self.rewriter = rewriter
+        self._wbParseContext = None
        self.out = outstream if outstream else sys.stdout

-    def _rewriteAttr(self, mod, value):
+
+    # ===========================
+    META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
+
+    def _rewriteMetaRefresh(self, metaRefresh):
+        m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
+        if not m:
+            return metaRefresh
+
+        try:
+            metaRefresh = metaRefresh[:m.start(1)] + self._rewriteURL(m.group(1)) + metaRefresh[m.end(1):]
+        except Exception:
+            pass
+
+        return metaRefresh
+    # ===========================
+
+    NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
+
+    def _rewriteURL(self, value, mod = None):
+        if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
+            return value
+
        return self.rewriter.rewrite(value, mod)

-    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
-        rwAttrs = tag_list.get(tag)
-        if not rwAttrs:
-            rwAttrs = tag_list.get('')

-        if not rwAttrs:
+    def _rewriteCSS(self, cssContent):
+        return cssContent
+
+    def _rewriteScript(self, scriptContent):
+        return scriptContent
+
+    def hasAttr(self, tagAttrs, attr):
+        name, value = attr
+        for attrName, attrValue in tagAttrs:
+            if attrName == name:
+                return value.lower() == attrValue.lower()
+        return False
+
+    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+        handler = WBHtml.REWRITE_TAGS.get(tag)
+        if not handler:
+            handler = WBHtml.REWRITE_TAGS.get('')
+
+        if not handler:
            return False

+        # special case: base tag
+        if (tag == 'base'):
+            newBase = tagAttrs.get('href')
+            if newBase:
+                self.rewriter.setBaseUrl(newBase[1])
+
+        # special case: script or style parse context
+        elif ((tag == 'script') or (tag == 'style')) and (self._wbParseContext == None):
+            self._wbParseContext = tag
+
        self.out.write('<' + tag)
+
        for attr in tagAttrs:
-            name, value = attr
-            rwMod = rwAttrs.get(name)
+            attrName, attrValue = attr

-            if rwMod is not None:
-                value = self._rewriteAttr(rwMod, value)
+            # special case: inline JS/event handler
+            if attrValue.startswith('javascript:') or attrName.startswith("on"):
+                attrValue = self._rewriteScript(attrValue)

-            self.out.write(' {0}="{1}"'.format(name, value))
+            # special case: inline CSS/style attribute
+            elif attrName == 'style':
+                attrValue = self._rewriteCSS(attrValue)
+
+            # special case: meta tag
+            elif (tag == 'meta') and (attrName == 'content') and self.hasAttr(tagAttrs, ('http-equiv', 'refresh')):
+                attrValue = self._rewriteMetaRefresh(attrValue)
+
+            else:
+                rwMod = handler.get(attrName)
+                if rwMod is not None:
+                    attrValue = self._rewriteURL(attrValue, rwMod)
+
+            self.out.write(' {0}="{1}"'.format(attrName, attrValue))

        self.out.write('/>' if isStartEnd else '>')
+
        return True

    def handle_starttag(self, tag, attrs):
-
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
-
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
+        if (tag == self._wbParseContext):
+            self._wbParseContext = None
+
        self.out.write('</' + tag + '>')

-    def handle_data(self, data):
+    def parseData(self, data):
+        if self._wbParseContext == 'script':
+            data = self._rewriteScript(data)
+        elif self._wbParseContext == 'style':
+            data = self._rewriteCSS(data)
+
        self.out.write(data)

+    def handle_data(self, data):
+        self.parseData(data)
+
    def handle_entityref(self, data):
        self.out.write('&' + data)

@ -72,7 +177,9 @@ class WBHtml(HTMLParser):
        self.out.write('&#' + data)

    def handle_comment(self, data):
-        self.out.write('<!--' + data + '-->')
+        self.out.write('<!--')
+        self.parseData(data)
+        self.out.write('-->')

    def handle_decl(self, data):
        self.out.write('<!' + data + '>')
@ -81,9 +188,9 @@ class WBHtml(HTMLParser):
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
-        self.out.write('<![' + data + ']>')
-
-
+        self.out.write('<![')
+        self.parseData(data)
+        self.out.write(']>')


 # instantiate the parser and feed it some HTML
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -20,6 +20,12 @@ class ArchivalUrlRewriter:
    >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

+    >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
+    'localhost:8080/*/http://example.com/other.html'
+
+    >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
+    'localhost:8080/20101226101112/http://some-other-site.com'
+
    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

@ -28,25 +34,33 @@ class ArchivalUrlRewriter:
      """

    def __init__(self, wburl_str, prefix):
-        self.wburl_str = wburl_str
+        self.wburl = ArchivalUrl(wburl_str)
        self.prefix = prefix
+
        if self.prefix.endswith('/'):
            self.prefix = self.prefix[:-1]

    def rewrite(self, rel_url, mod = None):
-        if '../' in rel_url or mod:
-            wburl = ArchivalUrl(self.wburl_str)
-            wburl.url = urlparse.urljoin(wburl.url, rel_url)
-            wburl.url = wburl.url.replace('../', '')
-            if mod is not None:
-                wburl.mod = mod
+        wburl = self.wburl

-            final_url = self.prefix + str(wburl)
-        else:
-            final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url)
+        # Disable optimization, doesn't work for external links
+        # if relative path or different mod, create rewrite from split up ArchivalUrl
+        #if rel_url.startswith('/') or ('../' in rel_url) or mod:
+        newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
+
+        if mod is None:
+            mod = wburl.mod
+
+        final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
+        # otherwise, optimize, and join directly with full url
+        #else:
+        #    final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)

        return final_url

+    def setBaseUrl(self, newUrl):
+        self.wburl.url = newUrl
+
 if __name__ == "__main__":
    import doctest