1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

html_rewriter: default attrs without value to empty str value, instead of no value

This commit is contained in:
Ilya Kreymer 2014-01-24 00:49:51 -08:00
parent 5987a0c047
commit 94326dafc1
2 changed files with 101 additions and 7 deletions

92
pywb/binsearch.py Normal file
View File

@ -0,0 +1,92 @@
from collections import deque
import os
import itertools
class FileReader(object):
    """Minimal binary file reader exposing the interface the search helpers use.

    The file is opened in binary mode and its size is captured once at
    construction time. The reader can be used as a context manager so the
    underlying file handle is always released.
    """

    def __init__(self, filename):
        """Open `filename` for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes, as measured when the reader was created."""
        return self.size

    def readline(self):
        """Read and return the next line (empty at end of file)."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte `offset`."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the `with` body propagate
        return False
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a sorted, line-oriented reader for a block near `key`.

    The reader is probed at multiples of `block_size`. At each probe the
    remainder of the line containing the probe position is discarded and the
    next complete line is compared against `key`.

    :param reader: object providing `getsize()`, `seek(offset)` and `readline()`
    :param key: value to search for; must be comparable with the lines read
    :param compare_func: three-way comparator `f(a, b)` returning a negative,
        zero or positive number; defaults to plain ordering of the operands
    :param block_size: probe granularity in bytes
    :return: byte offset (a multiple of `block_size`) from which a forward
        line scan should start
    """
    if compare_func is None:
        # Three-way comparison that does not rely on the `cmp` builtin,
        # which is unavailable on Python 3
        def compare_func(a, b):
            return (a > b) - (a < b)

    lo = 0
    # Floor division keeps block indexes integral regardless of division mode
    hi = reader.getsize() // block_size

    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        reader.seek(mid * block_size)

        # mid is always >= 1 here, so the probe may land mid-line:
        # skip the partial line
        reader.readline()

        line = reader.readline()

        # An empty read means no complete line starts after this probe, so
        # the key cannot be located from here: narrow toward the left.
        if line and compare_func(key, line) > 0:
            lo = mid
        else:
            hi = mid

    return lo * block_size
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over lines starting at the first line >= `key`.

    A block-level binary search (`binsearch_offset`) picks a starting offset,
    then lines are scanned forward until one compares greater than or equal
    to `key`. The returned generator yields up to `prev_size` lines that
    immediately precede that line (if any were scanned), then that line and
    every following line until the reader is exhausted.

    :param reader: object providing `getsize()`, `seek(offset)` and `readline()`
    :param key: value to search for; must be comparable with the lines read
    :param prev_size: number of preceding lines to yield before the first match
    :param compare_func: three-way comparator `f(a, b)`; defaults to plain
        ordering of the operands
    :param block_size: probe granularity in bytes for the binary search
    :return: generator of lines
    """
    if compare_func is None:
        # Three-way comparison that does not rely on the `cmp` builtin,
        # which is unavailable on Python 3
        def compare_func(a, b):
            return (a > b) - (a < b)

    offset = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # Bounded buffer of the most recent lines before the match. Using a deque
    # for every positive prev_size also covers the case where no preceding
    # line was scanned (previously an unbound variable for prev_size == 1).
    prev_lines = deque(maxlen=prev_size) if prev_size > 0 else None

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        if prev_lines is not None:
            prev_lines.append(line)
        line = reader.readline()

    def gen_iter(line):
        if prev_lines:
            for prev_line in prev_lines:
                yield prev_line

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
# Iterate over exact matches
def iter_exact(reader, key):
    """Yield the consecutive lines from `search(reader, key)` that start with `key`.

    Iteration ends at the first line that does not begin with `key`.
    """
    def has_prefix(candidate):
        return candidate.startswith(key)

    # Kept as a generator so the search only runs once iteration begins
    for matched in itertools.takewhile(has_prefix, search(reader, key)):
        yield matched

View File

@ -23,13 +23,14 @@ class WBHtml(HTMLParser):
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"><img src></div>
<input "selected"=""><img src=""></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# HTML Entities
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
<a href>&rsaquo; &nbsp; &#62;</div>
<a href="">&rsaquo; &nbsp; &#62;</div>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
@ -47,7 +48,7 @@ class WBHtml(HTMLParser):
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content>
<meta http-equiv="refresh" content="">
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
@ -61,7 +62,7 @@ class WBHtml(HTMLParser):
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
@ -218,11 +219,12 @@ class WBHtml(HTMLParser):
if rwMod is not None:
attrValue = self._rewriteURL(attrValue, rwMod)
if attrValue is not None:
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
# parser doesn't differentiate between 'attr=""' and just 'attr'
# 'attr=""' is more common, so use that form
if attrValue:
self.out.write(' ' + attrName + '="' + attrValue + '"')
else:
self.out.write(' ' + attrName)
self.out.write(' ' + attrName + '=""')
self.out.write('/>' if isStartEnd else '>')