add html parser!

urlrewriter support for changing modifier
2025-03-24 06:59:52 +01:00 · 2013-12-20 19:11:52 -08:00 · 2013-12-20 19:11:52 -08:00 · fbf29e80d6
commit fbf29e80d6
parent 072befe3c8
3 changed files with 115 additions and 9 deletions
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -48,19 +48,19 @@ class RemoteCDXServer:
            return response
    @staticmethod
-    def getQueryParams(wburl):
+    def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
        return {
            ArchivalUrl.QUERY:
-                {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
+                {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
            ArchivalUrl.URL_QUERY:
-                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
+                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },
            ArchivalUrl.REPLAY:
-                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
+                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
            ArchivalUrl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
@ -82,7 +82,7 @@ class CDXCaptureResult:
                cdxformat = i
        if not cdxformat:
-            raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
+            raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
        for header, field in zip(cdxformat, fields):
            setattr(self, header, field)
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -0,0 +1,101 @@
 import sys
 from HTMLParser import HTMLParser
 from wburlrewriter import ArchivalUrlRewriter
 # Rewrite table: tag name -> {attribute name: modifier}. For a listed
 # attribute, WBHtml passes the attribute value and this modifier to
 # rewriter.rewrite(); the '' modifier is not None, so such attributes are
 # still sent through the rewriter.
 tag_list = {
    'a': {'href': ''},
    'img': {'src': 'im_'}
 }
 # create a subclass and override the handler methods
 class WBHtml(HTMLParser):
    """
    Streaming HTML rewriter: re-emits the markup fed to the parser on an
    output stream, sending the attributes listed in ``tag_list`` through the
    supplied rewriter on the way.

    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
    """

    def __init__(self, rewriter, outstream = None):
        """
        rewriter  -- object whose rewrite(value, mod) returns the new URL
        outstream -- object with a write() method; defaults to sys.stdout
        """
        HTMLParser.__init__(self)
        self.rewriter = rewriter
        # Compare against None explicitly so that a caller-supplied stream
        # which happens to evaluate as false is not replaced by stdout.
        self.out = outstream if outstream is not None else sys.stdout

    def _rewriteAttr(self, mod, value):
        """Return value rewritten by the rewriter using modifier mod."""
        return self.rewriter.rewrite(value, mod)

    @staticmethod
    def _escapeAttr(value):
        """
        Re-escape an attribute value for output inside double quotes.
        The parser hands over values with entities already decoded, so '&'
        and '"' have to be encoded again ('&' first, to avoid double-encoding
        the '&quot;' produced by the second step).
        """
        return value.replace('&', '&amp;').replace('"', '&quot;')

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write a start tag whose attributes may need rewriting.

        tag        -- tag name as delivered by the parser
        tagAttrs   -- list of (name, value) pairs; value is None for an
                      attribute written without '=value'
        isStartEnd -- True to close the tag with '/>' instead of '>'

        Returns False, writing nothing, when tag_list has no entry for the
        tag (nor a '' fallback entry); the caller then emits the original
        tag text. Returns True once the tag has been written.
        """
        rwAttrs = tag_list.get(tag)
        if not rwAttrs:
            rwAttrs = tag_list.get('')
        if not rwAttrs:
            return False

        self.out.write('<' + tag)
        for name, value in tagAttrs:
            if value is None:
                # Valueless attribute (e.g. <a download>): keep it bare
                # rather than emitting name="None" or rewriting None.
                self.out.write(' ' + name)
                continue
            rwMod = rwAttrs.get(name)
            # '' is a valid modifier, so test for None rather than truth.
            if rwMod is not None:
                value = self._rewriteAttr(rwMod, value)
            self.out.write(' {0}="{1}"'.format(name, self._escapeAttr(value)))
        self.out.write('/>' if isStartEnd else '>')
        return True

    def handle_starttag(self, tag, attrs):
        # Tags without a rewrite entry are passed through verbatim.
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        # Tags without a rewrite entry are passed through verbatim.
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        self.out.write(data)

    def handle_entityref(self, data):
        # data is the bare entity name ('amp' for '&amp;'); restore both
        # delimiters so the reference stays well-formed in the output.
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        # data is the bare number ('169' or 'x41'); restore '&#' and ';'.
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        # NOTE(review): for CDATA-style marked sections the parser appears
        # to strip a ']]>' terminator, so this may emit one ']' too few --
        # confirm against the parser before changing.
        self.out.write('<![' + data + ']>')
 # instantiate the parser and feed it some HTML
 #parser = WBHtml()
 #instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
 #print instr
 #print
 #parser.feed(instr)
 #print
 if __name__ == "__main__":
    import doctest
    # The WBHtml doctests refer to a global named `rewriter`; testmod() runs
    # the examples against this module's globals, so it must be bound before
    # the call below.
    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
    doctest.testmod()
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'
    >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'
@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
        if self.prefix.endswith('/'):
            self.prefix = self.prefix[:-1]
-    def rewrite(self, rel_url):
+    def rewrite(self, rel_url, mod = None):
-        if '../' in rel_url:
+        if '../' in rel_url or mod:
            wburl = ArchivalUrl(self.wburl_str)
            wburl.url = urlparse.urljoin(wburl.url, rel_url)
            wburl.url = wburl.url.replace('../', '')
            if mod is not None:
                wburl.mod = mod
            final_url = self.prefix + str(wburl)
        else:
@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
 if __name__ == "__main__":
    import doctest
-    def test_rewrite(rel_url, base_url, prefix):
+    def test_rewrite(rel_url, base_url, prefix, mod = None):
        rewriter = ArchivalUrlRewriter(base_url, prefix)
-        return rewriter.rewrite(rel_url)
+        return rewriter.rewrite(rel_url, mod)
    doctest.testmod()