1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

fixes! Fix typos, in html parsing, fix base, support attrs w/o values

This commit is contained in:
Ilya Kreymer 2013-12-30 03:03:33 +00:00
parent a84ec2abc7
commit 997dc5df0f
4 changed files with 56 additions and 45 deletions

View File

@ -7,6 +7,7 @@ import urllib2
import StringIO import StringIO
import urlparse import urlparse
import collections import collections
import wbexceptions
#================================================================= #=================================================================
class HttpStreamLoader: class HttpStreamLoader:
@ -143,9 +144,9 @@ class ArchiveLoader:
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))] headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
# special case: http 0.9 response, no status or headers # special case: http 0.9 response, no status or headers
elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')): #elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
statusline = '200 OK' # statusline = '200 OK'
headers = [] # headers = []
# response record: parse HTTP status and headers! # response record: parse HTTP status and headers!
else: else:
@ -202,13 +203,13 @@ class LineReader:
self.buff = StringIO.StringIO(data) self.buff = StringIO.StringIO(data)
def read(self): def read(self, length = None):
self._fillbuff() self._fillbuff()
return self.buff.read() return self.buff.read(length)
def readline(self): def readline(self, length = None):
self._fillbuff() self._fillbuff()
return self.buff.readline() return self.buff.readline(length)
def close(self): def close(self):
if self.stream: if self.stream:

View File

@ -117,7 +117,7 @@ class ReplayHandler(object):
if fullUrl: if fullUrl:
return fullUrl return fullUrl
raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename) raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
#================================================================= #=================================================================
@ -125,10 +125,10 @@ class RewritingReplayHandler(ReplayHandler):
REWRITE_TYPES = { REWRITE_TYPES = {
'html': ('text/html', 'application/xhtml'), 'html': ['text/html', 'application/xhtml'],
'css': ('text/css'), 'css': ['text/css'],
'js': ('text/javascript', 'application/javascript', 'application/x-javascript'), 'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
'xml': ('/xml', '+xml', '.xml', '.rss'), 'xml': ['/xml', '+xml', '.xml', '.rss'],
} }
@ -145,11 +145,10 @@ class RewritingReplayHandler(ReplayHandler):
self.headInsert = headInsert self.headInsert = headInsert
def _canonContentType(self, contentType): def _textContentType(self, contentType):
for type, mimelist in self.REWRITE_TYPES.iteritems(): for ctype, mimelist in self.REWRITE_TYPES.iteritems():
for mime in mimelist: if any ((mime in contentType) for mime in mimelist):
if mime in contentType: return ctype
return type
return None return None
@ -168,13 +167,13 @@ class RewritingReplayHandler(ReplayHandler):
return response return response
contentType = utils.get_header(response.headersList, 'Content-Type') contentType = utils.get_header(response.headersList, 'Content-Type')
canonType = self._canonContentType(contentType) textType = self._textContentType(contentType) if contentType else None
(newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, (canonType is not None)) (newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)
# binary type, just send through # binary type, just send through
if canonType is None: if textType is None:
response.headersList = newHeaders response.headersList = newHeaders
return response return response
@ -186,19 +185,19 @@ class RewritingReplayHandler(ReplayHandler):
if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))): if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)) stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
return self._rewriteContent(canonType, urlrewriter, stream, newHeaders, response) return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
# TODO: first non-streaming attempt, probably want to stream # TODO: first non-streaming attempt, probably want to stream
def _rewriteContent(self, canonType, urlrewriter, stream, newHeaders, origResponse): def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
if canonType == 'html': if textType == 'html':
out = StringIO.StringIO() out = StringIO.StringIO()
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert) htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
try: try:
buff = stream.read() buff = stream.read()#.decode(encoding)
while buff: while buff:
htmlrewriter.feed(buff) htmlrewriter.feed(buff)
buff = stream.read() buff = stream.read()#.decode(encoding)
htmlrewriter.close() htmlrewriter.close()
@ -207,12 +206,13 @@ class RewritingReplayHandler(ReplayHandler):
finally: finally:
value = [out.getvalue()] value = [out.getvalue()]
newHeaders.append(('Content-Length', str(len(value[0]))))
out.close() out.close()
else: else:
if canonType == 'css': if textType == 'css':
rewriter = regexmatch.CSSRewriter(urlrewriter) rewriter = regexmatch.CSSRewriter(urlrewriter)
elif canonType == 'js': elif textType == 'js':
rewriter = regexmatch.JSRewriter(urlrewriter) rewriter = regexmatch.JSRewriter(urlrewriter)
def gen(): def gen():
@ -231,7 +231,7 @@ class RewritingReplayHandler(ReplayHandler):
def _rewriteHeaders(self, headers, stripEncoding = False): def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
newHeaders = [] newHeaders = []
removedHeaders = [] removedHeaders = []

View File

@ -34,7 +34,7 @@ class UnknownLoaderProtocolException(CaptureException):
pass pass
class InvalidArchiveRecordException(CaptureException): class InvalidArchiveRecordException(CaptureException):
def __init__(msg, errList = None): def __init__(self, msg, errList = None):
super(InvalidArchiveRecordException, self).__init__(msg) super(InvalidArchiveRecordException, self).__init__(msg)
self.errList = errList self.errList = errList

View File

@ -19,6 +19,13 @@ class WBHtml(HTMLParser):
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>') >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body> <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> parse('<input "selected"><img src></div>')
<input "selected"><img src></div>
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">') >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html"> <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
@ -35,8 +42,8 @@ class WBHtml(HTMLParser):
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>') >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script> <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>') >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div> <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>') >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style> <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
@ -130,14 +137,14 @@ class WBHtml(HTMLParser):
# =========================== # ===========================
def _rewriteURL(self, value, mod = None): def _rewriteURL(self, value, mod = None):
return self.rewriter.rewrite(value, mod) return self.rewriter.rewrite(value, mod) if value else None
def _rewriteCSS(self, cssContent): def _rewriteCSS(self, cssContent):
return self.cssRewriter.replaceAll(cssContent) return self.cssRewriter.replaceAll(cssContent) if cssContent else None
def _rewriteScript(self, scriptContent): def _rewriteScript(self, scriptContent):
return self.jsRewriter.replaceAll(scriptContent) return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
def hasAttr(self, tagAttrs, attr): def hasAttr(self, tagAttrs, attr):
name, value = attr name, value = attr
@ -147,14 +154,9 @@ class WBHtml(HTMLParser):
return False return False
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd): def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
# special case: base tag
if (tag == 'base'):
newBase = tagAttrs.get('href')
if newBase:
self.rewriter.setBaseUrl(newBase[1])
# special case: script or style parse context # special case: script or style parse context
elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None): if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
self._wbParseContext = tag self._wbParseContext = tag
# special case: head insertion, non-head tags # special case: head insertion, non-head tags
@ -176,7 +178,7 @@ class WBHtml(HTMLParser):
attrName, attrValue = attr attrName, attrValue = attr
# special case: inline JS/event handler # special case: inline JS/event handler
if attrValue.startswith('javascript:') or attrName.startswith("on"): if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith("on"):
attrValue = self._rewriteScript(attrValue) attrValue = self._rewriteScript(attrValue)
# special case: inline CSS/style attribute # special case: inline CSS/style attribute
@ -189,11 +191,19 @@ class WBHtml(HTMLParser):
attrValue = self._rewriteMetaRefresh(attrValue) attrValue = self._rewriteMetaRefresh(attrValue)
else: else:
# special case: base tag
if (tag == 'base') and (attrName == 'href') and attrValue:
self.rewriter.setBaseUrl(attrValue)
rwMod = handler.get(attrName) rwMod = handler.get(attrName)
if rwMod is not None: if rwMod is not None:
attrValue = self._rewriteURL(attrValue, rwMod) attrValue = self._rewriteURL(attrValue, rwMod)
self.out.write(' {0}="{1}"'.format(attrName, attrValue)) #self.out.write(' {0}="{1}"'.format(attrName, attrValue))
if attrValue:
self.out.write(' ' + attrName + '="' + attrValue + '"')
else:
self.out.write(' ' + attrName)
self.out.write('/>' if isStartEnd else '>') self.out.write('/>' if isStartEnd else '>')