mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
html_rewriter: default attrs without value to empty str value, instead of no value
This commit is contained in:
parent
5987a0c047
commit
94326dafc1
92
pywb/binsearch.py
Normal file
92
pywb/binsearch.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
from collections import deque
|
||||||
|
import os
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
class FileReader(object):
    """Seekable, line-oriented reader over a file opened in binary mode.

    Provides the ``getsize``/``readline``/``seek`` methods that
    ``binsearch_offset`` and ``search`` call on their ``reader`` argument.
    The instance owns the underlying file handle; release it with
    ``close()`` or by using the instance as a context manager.
    """

    def __init__(self, filename):
        """Open *filename* for binary reading and record its size in bytes."""
        self.fh = open(filename, 'rb')
        try:
            self.size = os.path.getsize(filename)
        except Exception:
            # Do not leak the freshly opened handle if the size lookup fails.
            self.fh.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        self.fh.close()

    def getsize(self):
        """Return the file size in bytes as measured when it was opened."""
        return self.size

    def readline(self):
        """Return the next line (newline included), or an empty value at EOF."""
        return self.fh.readline()

    def seek(self, offset):
        """Move the read position to the absolute byte *offset*."""
        return self.fh.seek(offset)
|
||||||
|
|
||||||
|
|
||||||
|
def binsearch_offset(reader, key, compare_func=None, block_size=8192):
    """Binary-search a line-sorted file for the block where *key* belongs.

    The file is treated as a sequence of ``block_size``-byte blocks.  Each
    probe seeks to the start of a block, discards one line (for every block
    except the first, since the seek may land mid-line) and compares *key*
    with the next line read.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to look for; must be comparable with the lines that
        ``reader.readline()`` returns.
    :param compare_func: ``cmp``-style callable ``f(a, b)`` returning a
        negative, zero or positive number.  ``None`` (the default) selects
        plain three-way ordering of the two values.
    :param block_size: probe granularity in bytes.
    :return: byte offset (a multiple of ``block_size``) from which a
        forward line scan for *key* should start.

    The result is only meaningful when the lines are sorted consistently
    with ``compare_func``.
    """
    if compare_func is None:
        # Equivalent of the Python 2 builtin ``cmp``, which no longer
        # exists on Python 3.
        def three_way(a, b):
            return (a > b) - (a < b)

        compare_func = three_way

    low = 0
    # Floor division keeps block indexes integral on Python 3 as well.
    high = reader.getsize() // block_size

    while high - low > 1:
        mid = low + (high - low) // 2
        reader.seek(mid * block_size)

        if mid > 0:
            reader.readline()  # skip partial line

        line = reader.readline()

        if compare_func(key, line) > 0:
            low = mid
        else:
            high = mid

    return low * block_size
|
||||||
|
|
||||||
|
|
||||||
|
def search(reader, key, prev_size=0, compare_func=None, block_size=8192):
    """Return an iterator over lines of a sorted file starting at *key*.

    ``binsearch_offset`` narrows the position down to one block; the lines
    from there are then scanned forward until the first line that compares
    greater than or equal to *key*.

    :param reader: object providing ``getsize()``, ``seek(offset)`` and
        ``readline()``.
    :param key: value to look for; must be comparable with the lines that
        ``reader.readline()`` returns.
    :param prev_size: how many of the lines scanned just before the first
        match to yield ahead of it (fewer are yielded if fewer were
        scanned).  Values of zero or less yield none.
    :param compare_func: ``cmp``-style callable ``f(a, b)`` returning a
        negative, zero or positive number.  ``None`` (the default) selects
        plain three-way ordering of the two values.
    :param block_size: probe granularity in bytes for the binary search.
    :return: generator yielding the retained preceding lines, then the
        first line ``>= key`` and every following line up to end of file.
    """
    if compare_func is None:
        # Equivalent of the Python 2 builtin ``cmp``, which no longer
        # exists on Python 3.
        def three_way(a, b):
            return (a > b) - (a < b)

        compare_func = three_way

    start = binsearch_offset(reader, key, compare_func, block_size)

    reader.seek(start)

    if start > 0:
        reader.readline()  # skip partial line

    # A single bounded deque covers every prev_size value.  With maxlen 0
    # appends are discarded, and an empty deque simply yields nothing, so
    # there is no unbound "previous line" when the very first line read
    # already matches.
    preceding = deque(maxlen=prev_size if prev_size > 0 else 0)

    line = reader.readline()
    while line and compare_func(line, key) < 0:
        preceding.append(line)
        line = reader.readline()

    def gen_iter(line):
        for earlier in preceding:
            yield earlier

        while line:
            yield line
            line = reader.readline()

    return gen_iter(line)
|
||||||
|
|
||||||
|
|
||||||
|
# Iterate over exact matches
|
||||||
|
def iter_exact(reader, key):
    """Lazily yield the run of lines from ``search(reader, key)`` that
    start with *key*, stopping at the first line that does not.
    """
    def has_prefix(candidate):
        return candidate.startswith(key)

    # Kept as a generator so that the search itself only runs once the
    # caller starts iterating.
    for match in itertools.takewhile(has_prefix, search(reader, key)):
        yield match
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -23,13 +23,14 @@ class WBHtml(HTMLParser):
|
|||||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||||
|
|
||||||
>>> parse('<input "selected"><img src></div>')
|
>>> parse('<input "selected"><img src></div>')
|
||||||
<input "selected"><img src></div>
|
<input "selected"=""><img src=""></div>
|
||||||
|
|
||||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||||
|
|
||||||
|
# HTML Entities
|
||||||
>>> parse('<a href="">› ></div>')
|
>>> parse('<a href="">› ></div>')
|
||||||
<a href>› ></div>
|
<a href="">› ></div>
|
||||||
|
|
||||||
# Don't rewrite anchors
|
# Don't rewrite anchors
|
||||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||||
@ -47,7 +48,7 @@ class WBHtml(HTMLParser):
|
|||||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||||
|
|
||||||
>>> parse('<META http-equiv="refresh" content>')
|
>>> parse('<META http-equiv="refresh" content>')
|
||||||
<meta http-equiv="refresh" content>
|
<meta http-equiv="refresh" content="">
|
||||||
|
|
||||||
# Script tag
|
# Script tag
|
||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
@ -61,7 +62,7 @@ class WBHtml(HTMLParser):
|
|||||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||||
|
|
||||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||||
|
|
||||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||||
@ -218,11 +219,12 @@ class WBHtml(HTMLParser):
|
|||||||
if rwMod is not None:
|
if rwMod is not None:
|
||||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||||
|
|
||||||
if attrValue is not None:
|
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||||
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
# 'attr=""' is more common, so use that form
|
||||||
|
if attrValue:
|
||||||
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||||
else:
|
else:
|
||||||
self.out.write(' ' + attrName)
|
self.out.write(' ' + attrName + '=""')
|
||||||
|
|
||||||
self.out.write('/>' if isStartEnd else '>')
|
self.out.write('/>' if isStartEnd else '>')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user