mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add html parser!
urlrewriter support for changing modifier
This commit is contained in:
parent
072befe3c8
commit
fbf29e80d6
@ -48,19 +48,19 @@ class RemoteCDXServer:
|
||||
return response
|
||||
|
||||
@staticmethod
|
||||
def getQueryParams(wburl):
|
||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
|
||||
return {
|
||||
|
||||
ArchivalUrl.QUERY:
|
||||
{'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
|
||||
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
||||
|
||||
ArchivalUrl.URL_QUERY:
|
||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
|
||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
||||
},
|
||||
|
||||
ArchivalUrl.REPLAY:
|
||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
|
||||
ArchivalUrl.LATEST_REPLAY:
|
||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||
@ -82,7 +82,7 @@ class CDXCaptureResult:
|
||||
cdxformat = i
|
||||
|
||||
if not cdxformat:
|
||||
raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
|
||||
raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
|
||||
|
||||
for header, field in zip(cdxformat, fields):
|
||||
setattr(self, header, field)
|
||||
|
101
pywb/wbhtml.py
Normal file
101
pywb/wbhtml.py
Normal file
@ -0,0 +1,101 @@
|
||||
import sys
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
from wburlrewriter import ArchivalUrlRewriter
|
||||
|
||||
# Maps a lowercase tag name to the attributes of that tag whose values are
# URLs to be rewritten.  Each attribute maps to the modifier string handed to
# the rewriter ('' rewrites the URL without adding a modifier, 'im_' yields
# the "...im_/" form shown in the WBHtml doctest).
tag_list = {
    'a': {'href': ''},
    'img': {'src': 'im_'}
}


class WBHtml(HTMLParser):
    """
    HTML parser that re-emits the markup it is fed, rewriting the URL
    attributes listed in ``tag_list`` through the supplied rewriter.

    Tags that are not listed in ``tag_list`` are written back using their
    original source text; listed tags are re-serialized with lowercase
    tag/attribute names and double-quoted values.

    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>

    """

    def __init__(self, rewriter, outstream = None):
        """
        rewriter  -- object exposing ``rewrite(value, mod)``; used for every
                     attribute listed in ``tag_list``.
        outstream -- object with a ``write`` method receiving the output;
                     defaults to ``sys.stdout`` when not given.
        """
        # Explicit base-class call (rather than super()) is kept from the
        # original code.
        HTMLParser.__init__(self)

        self.rewriter = rewriter
        # Compare against None so that a caller-supplied stream is never
        # silently replaced just because it happens to evaluate as false.
        self.out = outstream if outstream is not None else sys.stdout

    def _rewriteAttr(self, mod, value):
        """Return ``value`` rewritten by the rewriter using modifier ``mod``."""
        return self.rewriter.rewrite(value, mod)

    def _escapeAttr(self, value):
        """
        Escape ``value`` for output inside a double-quoted attribute.

        HTMLParser replaces character/entity references in attribute values
        before handing them to the handlers, so '&' and '"' must be encoded
        again or the emitted attribute could be terminated early.
        """
        return value.replace('&', '&amp;').replace('"', '&quot;')

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write ``tag`` with rewritten attributes if it is listed in
        ``tag_list`` (or ``tag_list`` has a '' catch-all entry).

        tag        -- lowercase tag name as reported by HTMLParser.
        tagAttrs   -- list of ``(name, value)`` pairs; ``value`` is None for
                      attributes written without a value.
        isStartEnd -- True to close the tag with '/>' instead of '>'.

        Returns True if the tag was written, False if the caller should emit
        the original tag text itself.
        """
        rwAttrs = tag_list.get(tag)
        if not rwAttrs:
            # Optional catch-all entry keyed by the empty string.
            rwAttrs = tag_list.get('')

        if not rwAttrs:
            return False

        self.out.write('<' + tag)

        for name, value in tagAttrs:
            if value is None:
                # Valueless attribute (e.g. <a download>): keep it bare
                # instead of emitting name="None".
                self.out.write(' ' + name)
                continue

            rwMod = rwAttrs.get(name)
            # '' is a valid modifier, so test against None rather than
            # truthiness.
            if rwMod is not None:
                value = self._rewriteAttr(rwMod, value)

            self.out.write(' {0}="{1}"'.format(name, self._escapeAttr(value)))

        self.out.write('/>' if isStartEnd else '>')
        return True

    def handle_starttag(self, tag, attrs):
        """Emit a start tag, rewritten when applicable, verbatim otherwise."""
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag, rewritten when applicable."""
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Emit the closing tag (name as lowercased by the parser)."""
        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        """Pass text content through unchanged."""
        self.out.write(data)

    def handle_entityref(self, data):
        """Re-emit a named reference, restoring the ';' the parser strips."""
        # NOTE(review): a reference that had no ';' in the input also gains
        # one here -- the parser does not report which form was used.
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        """Re-emit a numeric reference, restoring the ';' the parser strips."""
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        """Re-emit a comment."""
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        """Re-emit a declaration such as a doctype."""
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        """Re-emit a processing instruction."""
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        """Re-emit a declaration the parser does not recognise."""
        self.out.write('<![' + data + ']>')
|
||||
|
||||
|
||||
|
||||
|
||||
# instantiate the parser and feed it some HTML
|
||||
#parser = WBHtml()
|
||||
#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
|
||||
#print instr
|
||||
#print
|
||||
#parser.feed(instr)
|
||||
#print
|
||||
if __name__ == "__main__":
    import doctest

    # Bound at module level on purpose: the doctest examples in this module
    # refer to this object by the name ``rewriter``.
    rewriter = ArchivalUrlRewriter(
        '/20131226101010/http://example.com/some/path/index.html',
        '/web/'
    )

    doctest.testmod()
|
@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
|
||||
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
|
||||
|
||||
>>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
|
||||
|
||||
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
|
||||
'/coll/20130907*/http://example.com/path/other.html'
|
||||
|
||||
@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
|
||||
if self.prefix.endswith('/'):
|
||||
self.prefix = self.prefix[:-1]
|
||||
|
||||
def rewrite(self, rel_url):
|
||||
if '../' in rel_url:
|
||||
def rewrite(self, rel_url, mod = None):
|
||||
if '../' in rel_url or mod:
|
||||
wburl = ArchivalUrl(self.wburl_str)
|
||||
wburl.url = urlparse.urljoin(wburl.url, rel_url)
|
||||
wburl.url = wburl.url.replace('../', '')
|
||||
if mod is not None:
|
||||
wburl.mod = mod
|
||||
|
||||
final_url = self.prefix + str(wburl)
|
||||
else:
|
||||
@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
def test_rewrite(rel_url, base_url, prefix):
|
||||
def test_rewrite(rel_url, base_url, prefix, mod = None):
|
||||
rewriter = ArchivalUrlRewriter(base_url, prefix)
|
||||
return rewriter.rewrite(rel_url)
|
||||
return rewriter.rewrite(rel_url, mod)
|
||||
|
||||
doctest.testmod()
|
||||
|
Loading…
x
Reference in New Issue
Block a user