add html parser!

urlrewriter support for changing modifier
2025-03-15 00:03:28 +01:00 · 2013-12-20 19:11:52 -08:00 · 2013-12-20 19:11:52 -08:00 · fbf29e80d6
commit fbf29e80d6
parent 072befe3c8
3 changed files with 115 additions and 9 deletions
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -48,19 +48,19 @@ class RemoteCDXServer:
            return response

    @staticmethod
-    def getQueryParams(wburl):
+    def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
        return {

            ArchivalUrl.QUERY:
-                {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
+                {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},

            ArchivalUrl.URL_QUERY:
-                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
+                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },

            ArchivalUrl.REPLAY:
-                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
+                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},

            ArchivalUrl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
@ -82,7 +82,7 @@ class CDXCaptureResult:
                cdxformat = i

        if not cdxformat:
-            raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
+            raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))

        for header, field in zip(cdxformat, fields):
            setattr(self, header, field)
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -0,0 +1,101 @@
+import sys
+
+from HTMLParser import HTMLParser
+from wburlrewriter import ArchivalUrlRewriter
+
+tag_list = {  # tag name -> {attribute to rewrite: modifier passed to the rewriter}
+    'a': {'href': ''},  # '' -> the rewritten URL carries no modifier
+    'img': {'src': 'im_'}
+}
+
+# create a subclass and override the handler methods
+class WBHtml(HTMLParser):
+    """Echo parsed HTML to ``outstream`` (default ``sys.stdout``), passing the
+    ``tag_list`` attributes through ``rewriter.rewrite(value, mod)``.
+
+    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
+    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
+
+    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
+    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
+    """
+
+    def __init__(self, rewriter, outstream = None):
+        HTMLParser.__init__(self)
+        self.rewriter = rewriter
+        self.out = outstream if outstream is not None else sys.stdout
+
+    def _rewriteAttr(self, mod, value):
+        return self.rewriter.rewrite(value, mod)
+
+    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
+        """Write ``tag`` with its ``tag_list`` attributes rewritten; return
+        False (writing nothing) if ``tag_list`` has no entry for ``tag``."""
+        rwAttrs = tag_list.get(tag) or tag_list.get('')
+        if not rwAttrs:
+            return False
+
+        self.out.write('<' + tag)
+        for name, value in tagAttrs:
+            if value is None:
+                # Valueless attribute: keep it bare, not name="None".
+                self.out.write(' ' + name)
+                continue
+            rwMod = rwAttrs.get(name)
+            if rwMod is not None:  # '' is a valid (empty) modifier
+                value = self._rewriteAttr(rwMod, value)
+            # The parser unescapes attribute values; re-escape for output.
+            value = value.replace('&', '&amp;').replace('"', '&quot;')
+            self.out.write(' {0}="{1}"'.format(name, value))
+
+        self.out.write('/>' if isStartEnd else '>')
+        return True
+
+    def handle_starttag(self, tag, attrs):
+        if not self.rewriteTagAttrs(tag, attrs, False):
+            self.out.write(self.get_starttag_text())
+
+    def handle_startendtag(self, tag, attrs):
+        if not self.rewriteTagAttrs(tag, attrs, True):
+            self.out.write(self.get_starttag_text())
+
+    def handle_endtag(self, tag):
+        self.out.write('</' + tag + '>')
+
+    def handle_data(self, data):
+        self.out.write(data)
+
+    def handle_entityref(self, data):
+        # ``data`` is the bare entity name; restore the closing ';'.
+        self.out.write('&' + data + ';')
+
+    def handle_charref(self, data):
+        # ``data`` is the bare number; restore the closing ';'.
+        self.out.write('&#' + data + ';')
+
+    def handle_comment(self, data):
+        self.out.write('<!--' + data + '-->')
+
+    def handle_decl(self, data):
+        self.out.write('<!' + data + '>')
+
+    def handle_pi(self, data):
+        self.out.write('<?' + data + '>')
+
+    def unknown_decl(self, data):
+        self.out.write('<![' + data + ']>')
+
+
+# instantiate the parser and feed it some HTML
+#parser = WBHtml()
+#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
+#print instr
+#print
+#parser.feed(instr)
+#print
+if __name__ == "__main__":
+    import doctest
+    # `rewriter` is the module-level name the WBHtml doctests reference.
+    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+
+    doctest.testmod()
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

+    >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
+    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
+
    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
        if self.prefix.endswith('/'):
            self.prefix = self.prefix[:-1]

-    def rewrite(self, rel_url):
-        if '../' in rel_url:
+    def rewrite(self, rel_url, mod = None):
+        if '../' in rel_url or mod:
            wburl = ArchivalUrl(self.wburl_str)
            wburl.url = urlparse.urljoin(wburl.url, rel_url)
            wburl.url = wburl.url.replace('../', '')
+            if mod is not None:
+                wburl.mod = mod

            final_url = self.prefix + str(wburl)
        else:
@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
 if __name__ == "__main__":
    import doctest

-    def test_rewrite(rel_url, base_url, prefix):
+    def test_rewrite(rel_url, base_url, prefix, mod = None):
        rewriter = ArchivalUrlRewriter(base_url, prefix)
-        return rewriter.rewrite(rel_url)
+        return rewriter.rewrite(rel_url, mod)

    doctest.testmod()