1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add html parser!

urlrewriter support for changing modifier
This commit is contained in:
Ilya Kreymer 2013-12-20 19:11:52 -08:00
parent 072befe3c8
commit fbf29e80d6
3 changed files with 115 additions and 9 deletions

View File

@ -48,19 +48,19 @@ class RemoteCDXServer:
return response
@staticmethod
def getQueryParams(wburl):
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
return {
ArchivalUrl.QUERY:
{'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
ArchivalUrl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
},
ArchivalUrl.REPLAY:
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
ArchivalUrl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
@ -82,7 +82,7 @@ class CDXCaptureResult:
cdxformat = i
if not cdxformat:
raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in zip(cdxformat, fields):
setattr(self, header, field)

101
pywb/wbhtml.py Normal file
View File

@ -0,0 +1,101 @@
import sys
from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
# Maps a lowercase tag name to the attributes of that tag which hold URLs.
# Each attribute maps to the modifier string handed to the rewriter together
# with the attribute value ('' for <a href>, 'im_' for <img src>).
tag_list = {
    'a': {'href': ''},
    'img': {'src': 'im_'}
}


# create a subclass and override the handler methods
class WBHtml(HTMLParser):
    """
    Streaming HTML rewriter: re-emits the parsed document to an output
    stream, passing the URL attributes listed in ``tag_list`` through
    ``rewriter.rewrite(value, mod)``.

    Tags not listed in ``tag_list`` are written back using their original
    source text.  Tags that are listed are re-serialized with a lowercase
    tag name, double-quoted attribute values and escaped ``&`` / ``"``.

    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>
    """

    def __init__(self, rewriter, outstream = None):
        """
        rewriter  -- object providing ``rewrite(value, mod)``
        outstream -- object providing ``write(text)``; defaults to sys.stdout
        """
        HTMLParser.__init__(self)
        self.rewriter = rewriter
        # Compare against None explicitly so that a valid stream object which
        # happens to be falsy is not silently replaced by stdout.
        self.out = outstream if outstream is not None else sys.stdout

    def _rewriteAttr(self, mod, value):
        """Return ``value`` rewritten by the rewriter using modifier ``mod``."""
        return self.rewriter.rewrite(value, mod)

    def _escapeAttr(self, value):
        """
        Escape ``value`` for output inside a double-quoted attribute.

        HTMLParser hands attribute values over already unescaped, so '&' and
        '"' have to be re-escaped or the emitted tag may be malformed.
        """
        return value.replace('&', '&amp;').replace('"', '&quot;')

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write ``tag`` with rewritten attributes if it has rewrite rules.

        tag        -- tag name as passed to the HTMLParser handlers
        tagAttrs   -- list of (name, value) pairs; value is None for
                      attributes written without a value
        isStartEnd -- True to close the tag with '/>', False for '>'

        Returns True if the tag was written, False if no rules apply (in
        which case nothing is written and the caller emits the tag itself).
        """
        # An entry under the empty tag name acts as a fallback rule set.
        rwAttrs = tag_list.get(tag) or tag_list.get('')
        if not rwAttrs:
            return False

        self.out.write('<' + tag)

        for name, value in tagAttrs:
            if value is None:
                # Valueless attribute (e.g. <img ismap>): keep it bare
                # instead of emitting name="None" or rewriting None.
                self.out.write(' ' + name)
                continue

            rwMod = rwAttrs.get(name)
            # '' is a valid modifier, so test for None rather than truthiness.
            if rwMod is not None:
                value = self._rewriteAttr(rwMod, value)

            self.out.write(' {0}="{1}"'.format(name, self._escapeAttr(value)))

        self.out.write('/>' if isStartEnd else '>')
        return True

    def handle_starttag(self, tag, attrs):
        """Emit a start tag, rewritten if rules exist, else as in the source."""
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag, rewritten if rules exist, else as in the source."""
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Emit an end tag."""
        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        """Emit text content unchanged."""
        self.out.write(data)

    def handle_entityref(self, data):
        """Emit a named entity reference; ``data`` is the name without '&' and ';'."""
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        """Emit a numeric character reference; ``data`` excludes '&#' and ';'."""
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        """Emit a comment."""
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        """Emit a declaration such as a doctype."""
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        """Emit a processing instruction."""
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        """Emit an unrecognized marked-section declaration."""
        self.out.write('<![' + data + ']>')
# instantiate the parser and feed it some HTML
#parser = WBHtml()
#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
#print instr
#print
#parser.feed(instr)
#print
if __name__ == "__main__":
    # The WBHtml doctests reference a module-level name `rewriter`; define it
    # in this module's globals before doctest.testmod() runs the examples.
    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    import doctest
    doctest.testmod()

View File

@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
>>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/path/other.html'
@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
def rewrite(self, rel_url):
if '../' in rel_url:
def rewrite(self, rel_url, mod = None):
if '../' in rel_url or mod:
wburl = ArchivalUrl(self.wburl_str)
wburl.url = urlparse.urljoin(wburl.url, rel_url)
wburl.url = wburl.url.replace('../', '')
if mod is not None:
wburl.mod = mod
final_url = self.prefix + str(wburl)
else:
@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
if __name__ == "__main__":
import doctest
def test_rewrite(rel_url, base_url, prefix):
def test_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = ArchivalUrlRewriter(base_url, prefix)
return rewriter.rewrite(rel_url)
return rewriter.rewrite(rel_url, mod)
doctest.testmod()