mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
add html parser!
urlrewriter support for changing modifier
This commit is contained in:
parent
072befe3c8
commit
fbf29e80d6
@ -48,19 +48,19 @@ class RemoteCDXServer:
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def getQueryParams(wburl):
|
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '10'):
|
||||||
return {
|
return {
|
||||||
|
|
||||||
ArchivalUrl.QUERY:
|
ArchivalUrl.QUERY:
|
||||||
{'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
|
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
||||||
|
|
||||||
ArchivalUrl.URL_QUERY:
|
ArchivalUrl.URL_QUERY:
|
||||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
|
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||||
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
||||||
},
|
},
|
||||||
|
|
||||||
ArchivalUrl.REPLAY:
|
ArchivalUrl.REPLAY:
|
||||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
|
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||||
|
|
||||||
ArchivalUrl.LATEST_REPLAY:
|
ArchivalUrl.LATEST_REPLAY:
|
||||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||||
@ -82,7 +82,7 @@ class CDXCaptureResult:
|
|||||||
cdxformat = i
|
cdxformat = i
|
||||||
|
|
||||||
if not cdxformat:
|
if not cdxformat:
|
||||||
raise InvalidCDXException('unknown %d-field cdx format' % len(fields))
|
raise InvalidCDXException('unknown {0}-field cdx format'.format(len(fields)))
|
||||||
|
|
||||||
for header, field in zip(cdxformat, fields):
|
for header, field in zip(cdxformat, fields):
|
||||||
setattr(self, header, field)
|
setattr(self, header, field)
|
||||||
|
101
pywb/wbhtml.py
Normal file
101
pywb/wbhtml.py
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
|
# Rewrite rules looked up by WBHtml.rewriteTagAttrs:
#   tag name -> {attribute name -> modifier handed to the url rewriter}
# An empty modifier ('') still marks the attribute as rewritable.
tag_list = dict(
    a = dict(href = ''),
    img = dict(src = 'im_'),
)
|
||||||
|
|
||||||
|
# create a subclass and override the handler methods
|
||||||
|
class WBHtml(HTMLParser):
    """
    Re-emit parsed HTML to an output stream, rewriting selected attributes.

    Every parser event is written back out to ``self.out``.  For tags that
    have an entry in the module-level ``tag_list``, each listed attribute
    value is passed through ``rewriter.rewrite(value, mod)`` first; all other
    start tags are copied through using their original source text.

    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

    >>> WBHtml(rewriter).feed('<img src="../img.gif"/><br/>')
    <img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/>

    """

    # Marked-section keywords whose terminator is ']]>' rather than ']>'.
    # NOTE(review): mirrors the keyword list used by the stdlib markupbase
    # parser when it scans marked sections -- confirm if that list changes.
    _DOUBLE_BRACKET_SECTIONS = frozenset(
        ['temp', 'cdata', 'ignore', 'include', 'rcdata'])

    def __init__(self, rewriter, outstream = None):
        """
        rewriter  -- object exposing rewrite(value, mod)
        outstream -- object exposing write(); sys.stdout when omitted
        """
        HTMLParser.__init__(self)

        self.rewriter = rewriter
        # Compare against None explicitly so that a valid stream which
        # happens to evaluate as false is not silently replaced by stdout.
        self.out = outstream if outstream is not None else sys.stdout

    def _rewriteAttr(self, mod, value):
        """Return value rewritten by the url rewriter using modifier mod."""
        return self.rewriter.rewrite(value, mod)

    def _escapeAttr(self, value):
        """
        Make value safe to place inside a double-quoted attribute.

        The parser hands attribute values over already unescaped, so '&' and
        '"' have to be escaped again or the emitted tag would be malformed.
        """
        return value.replace('&', '&amp;').replace('"', '&quot;')

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
        """
        Write a start tag with its rewritable attributes rewritten.

        Returns False, writing nothing, when tag_list has neither an entry
        for this tag nor a catch-all '' entry; the caller is then expected to
        emit the tag itself.  Returns True once the tag has been written.
        """
        rwAttrs = tag_list.get(tag) or tag_list.get('')
        if not rwAttrs:
            return False

        self.out.write('<' + tag)

        for name, value in tagAttrs:
            # Valueless attribute (e.g. <input disabled>): the parser reports
            # None, which must not be written out as the string "None" nor
            # handed to the rewriter.
            if value is None:
                self.out.write(' ' + name)
                continue

            rwMod = rwAttrs.get(name)
            # '' is a valid modifier, so test for None and not truthiness
            if rwMod is not None:
                value = self._rewriteAttr(rwMod, value)

            self.out.write(' {0}="{1}"'.format(name, self._escapeAttr(value)))

        self.out.write('/>' if isStartEnd else '>')
        return True

    def handle_starttag(self, tag, attrs):
        # Tags without rewrite rules are copied verbatim from the source
        if not self.rewriteTagAttrs(tag, attrs, False):
            self.out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        # Same as handle_starttag, but a rewritten tag is closed with '/>'
        if not self.rewriteTagAttrs(tag, attrs, True):
            self.out.write(self.get_starttag_text())

    def handle_endtag(self, tag):
        self.out.write('</' + tag + '>')

    def handle_data(self, data):
        self.out.write(data)

    def handle_entityref(self, data):
        # Only the entity name is passed in, so the terminating ';' has to be
        # restored: without it many named entities are no longer recognised.
        # NOTE(review): a bare '&name' that had no ';' in the source also
        # gains one here -- confirm that is acceptable for replay.
        self.out.write('&' + data + ';')

    def handle_charref(self, data):
        # Restore the ';' so that following hex digits or digits in the text
        # cannot be absorbed into the character reference.
        self.out.write('&#' + data + ';')

    def handle_comment(self, data):
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        self.out.write('<?' + data + '>')

    def unknown_decl(self, data):
        # data is the text between '<![' and the section terminator.  CDATA
        # style sections end in ']]>', conditional ones ('if', 'else',
        # 'endif') in ']>', so pick the terminator from the leading keyword.
        section = data.split('[', 1)[0].strip().lower()
        closing = ']]>' if section in self._DOUBLE_BRACKET_SECTIONS else ']>'
        self.out.write('<![' + data + closing)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# instantiate the parser and feed it some HTML
|
||||||
|
#parser = WBHtml()
|
||||||
|
#instr = '<HTML X=\'a\' B=\'234\' some="other"><a href="Test"><BR/><head><title>Test</title></head>\n<body><h1>Parse me!</h1></body></HTML>'
|
||||||
|
#print instr
|
||||||
|
#print
|
||||||
|
#parser.feed(instr)
|
||||||
|
#print
|
||||||
|
if __name__ == "__main__":
    # The doctests in WBHtml's docstring refer to a global named `rewriter`,
    # so it has to exist at module level before the tests are collected.
    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

    import doctest
    doctest.testmod()
|
@ -8,6 +8,9 @@ class ArchivalUrlRewriter:
|
|||||||
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||||
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
|
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
|
||||||
|
|
||||||
|
>>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
|
||||||
|
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
|
||||||
|
|
||||||
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
|
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
|
||||||
'/coll/20130907*/http://example.com/path/other.html'
|
'/coll/20130907*/http://example.com/path/other.html'
|
||||||
|
|
||||||
@ -30,11 +33,13 @@ class ArchivalUrlRewriter:
|
|||||||
if self.prefix.endswith('/'):
|
if self.prefix.endswith('/'):
|
||||||
self.prefix = self.prefix[:-1]
|
self.prefix = self.prefix[:-1]
|
||||||
|
|
||||||
def rewrite(self, rel_url):
|
def rewrite(self, rel_url, mod = None):
|
||||||
if '../' in rel_url:
|
if '../' in rel_url or mod:
|
||||||
wburl = ArchivalUrl(self.wburl_str)
|
wburl = ArchivalUrl(self.wburl_str)
|
||||||
wburl.url = urlparse.urljoin(wburl.url, rel_url)
|
wburl.url = urlparse.urljoin(wburl.url, rel_url)
|
||||||
wburl.url = wburl.url.replace('../', '')
|
wburl.url = wburl.url.replace('../', '')
|
||||||
|
if mod is not None:
|
||||||
|
wburl.mod = mod
|
||||||
|
|
||||||
final_url = self.prefix + str(wburl)
|
final_url = self.prefix + str(wburl)
|
||||||
else:
|
else:
|
||||||
@ -45,8 +50,8 @@ class ArchivalUrlRewriter:
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
def test_rewrite(rel_url, base_url, prefix):
|
def test_rewrite(rel_url, base_url, prefix, mod = None):
|
||||||
rewriter = ArchivalUrlRewriter(base_url, prefix)
|
rewriter = ArchivalUrlRewriter(base_url, prefix)
|
||||||
return rewriter.rewrite(rel_url)
|
return rewriter.rewrite(rel_url, mod)
|
||||||
|
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user