1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' into video

This commit is contained in:
Ilya Kreymer 2014-11-04 12:19:58 -08:00
commit fea48fd27a
3 changed files with 32 additions and 9 deletions

View File

@ -28,8 +28,17 @@ class CDXFile(CDXSource):
self.filename = filename
def load_cdx(self, query):
    """Return an iterator over the CDX lines that fall in the query range.

    The file named by ``self.filename`` is opened lazily, on the first
    iteration of the returned generator, and is closed as soon as the
    generator is exhausted, closed, or garbage-collected.

    :param query: object providing ``key`` and ``end_key``, the bounds
        handed to ``iter_range``.
    :return: a generator yielding the lines produced by ``iter_range``.
    :raises IOError: on first iteration, if the file cannot be opened.
    """
    def do_open():
        # Use a context manager rather than try/finally around open():
        # if open() itself fails there is no file object to close, and a
        # ``finally: source.close()`` would raise UnboundLocalError and
        # hide the original IOError.
        with open(self.filename) as source:
            for line in iter_range(source, query.key, query.end_key):
                yield line

    return do_open()
# Human-readable label for this source: the fixed prefix 'CDX File - '
# followed by the configured filename.
def __str__(self):
return 'CDX File - ' + self.filename

View File

@ -274,10 +274,20 @@ class HTMLRewriterMixin(object):
#=================================================================
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
PARSETAG = re.compile('[<]')
# Initialise the HTMLParser base explicitly (no arguments), then run the
# cooperative __init__ chain with the caller's positional/keyword arguments.
# NOTE(review): presumably the mixin's __init__ does not itself reach
# HTMLParser.__init__, hence the direct call - confirm against the mixin.
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self)
super(HTMLRewriter, self).__init__(*args, **kwargs)
# Reset the underlying HTMLParser state, then point `interesting` back at
# PARSETAG. The assignment must come after the base reset so that it
# overrides whatever value the base class installs there.
def reset(self):
HTMLParser.reset(self)
self.interesting = self.PARSETAG
# Leave CDATA mode via the base class, then point `interesting` back at
# PARSETAG. The assignment must come after the base call so that it
# overrides whatever value the base class installs there.
def clear_cdata_mode(self):
HTMLParser.clear_cdata_mode(self)
self.interesting = self.PARSETAG
def feed(self, string):
try:
HTMLParser.feed(self, string)
@ -322,11 +332,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
# HTMLParser callback for text content: hands the data to parse_data().
def handle_data(self, data):
self.parse_data(data)
# HTMLParser callback for a named entity reference: writes it to self.out
# as '&' + data + ';'.
def handle_entityref(self, data):
self.out.write('&' + data + ';')
# HTMLParser callback for a numeric character reference: writes it to
# self.out as '&#' + data + ';'.
def handle_charref(self, data):
self.out.write('&#' + data + ';')
# overriding regex so that these are no longer called
#def handle_entityref(self, data):
# self.out.write('&' + data + ';')
#
#def handle_charref(self, data):
# self.out.write('&#' + data + ';')
def handle_comment(self, data):
self.out.write('<!--')

View File

@ -28,8 +28,11 @@ ur"""
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href="">&rsaquo; &nbsp; &#62;</div>
>>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
<a href="">&rsaquo; &nbsp; &#62; &#63</div>
>>> parse('<div>X&Y</div> </div>X&Y;</div>')
<div>X&Y</div> </div>X&Y;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')