mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
html_rewriter: default attrs without value to empty str value, instead of no value
This commit is contained in:
parent
5987a0c047
commit
94326dafc1
92
pywb/binsearch.py
Normal file
92
pywb/binsearch.py
Normal file
@ -0,0 +1,92 @@
|
||||
from collections import deque
|
||||
import os
|
||||
import itertools
|
||||
|
||||
class FileReader(object):
    """Minimal line-oriented reader over a file opened in binary mode.

    Exposes the small interface the search helpers in this module call on
    a reader: ``getsize()``, ``readline()`` and ``seek()``.

    The underlying handle stays open until ``close()`` is called; the
    reader can also be used as a context manager so the handle is released
    deterministically::

        with FileReader(path) as reader:
            ...
    """

    def __init__(self, filename):
        """Open *filename* for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        self.size = os.path.getsize(filename)

    def getsize(self):
        """Return the file size in bytes, as measured at construction time."""
        return self.size

    def readline(self):
        """Return the next line (including its line ending), or an empty
        value at end of file."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte *offset*."""
        return self.fh.seek(offset)

    def close(self):
        """Close the underlying file handle; calling it again is harmless."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the handle; returning None lets exceptions propagate.
        self.close()
|
||||
|
||||
|
||||
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search *reader* block by block and return a starting offset.

    The search only makes sense when the reader's lines are sorted
    consistently with *compare_func*.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to locate; must be comparable with the lines the
        reader returns.
    :param compare_func: ``cmp``-style callable returning a negative, zero
        or positive number.  Defaults to a plain three-way comparison,
        which is what the ``cmp`` builtin did and also works where that
        builtin no longer exists.
    :param block_size: granularity of the search, in bytes.
    :returns: a byte offset that is a multiple of *block_size*; a linear
        scan for *key* can start there.
    """
    def default_compare(a, b):
        return (a > b) - (a < b)

    compare = compare_func if compare_func is not None else default_compare

    lo = 0
    # Floor division keeps block indices integral on every Python version,
    # so the value handed to seek() is always an int.
    hi = reader.getsize() // block_size

    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        reader.seek(mid * block_size)

        # mid is always >= 1 inside this loop, so the seek may have landed
        # in the middle of a line: discard up to the next line boundary.
        reader.readline()

        line = reader.readline()

        # An empty read means no complete line starts after this block
        # boundary, so the key cannot start beyond it either: narrow the
        # upper bound instead of comparing against the empty value.
        if line and compare(key, line) > 0:
            lo = mid
        else:
            hi = mid

    return lo * block_size
|
||||
|
||||
|
||||
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over the reader's lines starting at *key*.

    ``binsearch_offset`` narrows the position to a block; the block is then
    scanned linearly until the first line that compares greater than or
    equal to *key*.  The returned generator yields up to *prev_size* lines
    that immediately precede that line in the scan, then that line and
    every following line until the reader is exhausted.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to locate; must be comparable with the reader's lines.
    :param prev_size: maximum number of preceding lines to yield first.
        Fewer are yielded when fewer were seen during the scan; values
        below one yield none.
    :param compare_func: ``cmp``-style callable; defaults to a plain
        three-way comparison.
    :param block_size: block granularity passed to ``binsearch_offset``.
    """
    def default_compare(a, b):
        return (a > b) - (a < b)

    compare = compare_func if compare_func is not None else default_compare

    offset = binsearch_offset(reader, key, compare, block_size)
    reader.seek(offset)

    if offset > 0:
        reader.readline()  # skip partial line

    # One bounded deque covers every prev_size: it stays empty when no
    # earlier line was seen, so there is never an unbound "previous line"
    # to yield.
    prev_lines = deque(maxlen=max(prev_size, 0))

    while True:
        line = reader.readline()
        if not line or compare(line, key) >= 0:
            break
        prev_lines.append(line)

    def gen_iter(line):
        for prev_line in prev_lines:
            yield prev_line

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
||||
|
||||
|
||||
# Iterate over exact matches
|
||||
def iter_exact(reader, key):
    """Yield, in order, the lines found by ``search`` that begin with *key*.

    Iteration stops at the first line that does not start with *key*.
    """
    def has_prefix(candidate):
        return candidate.startswith(key)

    for match in itertools.takewhile(has_prefix, search(reader, key)):
        yield match
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -23,13 +23,14 @@ class WBHtml(HTMLParser):
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||
|
||||
>>> parse('<input "selected"><img src></div>')
|
||||
<input "selected"><img src></div>
|
||||
<input "selected"=""><img src=""></div>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href>› ></div>
|
||||
<a href="">› ></div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
@ -47,7 +48,7 @@ class WBHtml(HTMLParser):
|
||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<meta http-equiv="refresh" content>
|
||||
<meta http-equiv="refresh" content="">
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
@ -61,7 +62,7 @@ class WBHtml(HTMLParser):
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
@ -218,11 +219,12 @@ class WBHtml(HTMLParser):
|
||||
if rwMod is not None:
|
||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||
|
||||
if attrValue is not None:
|
||||
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||
# 'attr=""' is more common, so use that form
|
||||
if attrValue:
|
||||
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||
else:
|
||||
self.out.write(' ' + attrName)
|
||||
self.out.write(' ' + attrName + '=""')
|
||||
|
||||
self.out.write('/>' if isStartEnd else '>')
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user