mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
fixes! Fix typos, in html parsing, fix base, support attrs w/o values
This commit is contained in:
parent
a84ec2abc7
commit
997dc5df0f
@ -7,6 +7,7 @@ import urllib2
|
|||||||
import StringIO
|
import StringIO
|
||||||
import urlparse
|
import urlparse
|
||||||
import collections
|
import collections
|
||||||
|
import wbexceptions
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HttpStreamLoader:
|
class HttpStreamLoader:
|
||||||
@ -143,9 +144,9 @@ class ArchiveLoader:
|
|||||||
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
|
headers = [('Content-Type', utils.get_header(parsed.headers, 'Content-Type'))]
|
||||||
|
|
||||||
# special case: http 0.9 response, no status or headers
|
# special case: http 0.9 response, no status or headers
|
||||||
elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
|
#elif recType == 'response' and (';version=0.9' in utils.get_header(parsed.headers, 'Content-Type')):
|
||||||
statusline = '200 OK'
|
# statusline = '200 OK'
|
||||||
headers = []
|
# headers = []
|
||||||
|
|
||||||
# response record: parse HTTP status and headers!
|
# response record: parse HTTP status and headers!
|
||||||
else:
|
else:
|
||||||
@ -202,13 +203,13 @@ class LineReader:
|
|||||||
|
|
||||||
self.buff = StringIO.StringIO(data)
|
self.buff = StringIO.StringIO(data)
|
||||||
|
|
||||||
def read(self):
|
def read(self, length = None):
|
||||||
self._fillbuff()
|
self._fillbuff()
|
||||||
return self.buff.read()
|
return self.buff.read(length)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self, length = None):
|
||||||
self._fillbuff()
|
self._fillbuff()
|
||||||
return self.buff.readline()
|
return self.buff.readline(length)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.stream:
|
if self.stream:
|
||||||
|
@ -117,7 +117,7 @@ class ReplayHandler(object):
|
|||||||
if fullUrl:
|
if fullUrl:
|
||||||
return fullUrl
|
return fullUrl
|
||||||
|
|
||||||
raise exceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + cdx.filename)
|
raise wbexceptions.UnresolvedArchiveFileException('Archive File Not Found: ' + filename)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -125,10 +125,10 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
|
|
||||||
|
|
||||||
REWRITE_TYPES = {
|
REWRITE_TYPES = {
|
||||||
'html': ('text/html', 'application/xhtml'),
|
'html': ['text/html', 'application/xhtml'],
|
||||||
'css': ('text/css'),
|
'css': ['text/css'],
|
||||||
'js': ('text/javascript', 'application/javascript', 'application/x-javascript'),
|
'js': ['text/javascript', 'application/javascript', 'application/x-javascript'],
|
||||||
'xml': ('/xml', '+xml', '.xml', '.rss'),
|
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -145,11 +145,10 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
self.headInsert = headInsert
|
self.headInsert = headInsert
|
||||||
|
|
||||||
|
|
||||||
def _canonContentType(self, contentType):
|
def _textContentType(self, contentType):
|
||||||
for type, mimelist in self.REWRITE_TYPES.iteritems():
|
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
|
||||||
for mime in mimelist:
|
if any ((mime in contentType) for mime in mimelist):
|
||||||
if mime in contentType:
|
return ctype
|
||||||
return type
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -168,13 +167,13 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
contentType = utils.get_header(response.headersList, 'Content-Type')
|
contentType = utils.get_header(response.headersList, 'Content-Type')
|
||||||
|
|
||||||
canonType = self._canonContentType(contentType)
|
textType = self._textContentType(contentType) if contentType else None
|
||||||
|
|
||||||
(newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, (canonType is not None))
|
(newHeaders, remHeaders) = self._rewriteHeaders(response.headersList, urlrewriter, textType is not None)
|
||||||
|
|
||||||
# binary type, just send through
|
# binary type, just send through
|
||||||
if canonType is None:
|
if textType is None:
|
||||||
response.headersList = newHeaders
|
response.headersList = newHeaders
|
||||||
return response
|
return response
|
||||||
|
|
||||||
@ -186,19 +185,19 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
|
if (utils.contains_header(remHeaders, ('Content-Encoding', 'gzip'))):
|
||||||
stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
|
stream = archiveloader.LineStream(stream, decomp = zlib.decompressobj(16 + zlib.MAX_WBITS))
|
||||||
|
|
||||||
return self._rewriteContent(canonType, urlrewriter, stream, newHeaders, response)
|
return self._rewriteContent(textType, urlrewriter, stream, newHeaders, response)
|
||||||
|
|
||||||
# TODO: first non-streaming attempt, probably want to stream
|
# TODO: first non-streaming attempt, probably want to stream
|
||||||
def _rewriteContent(self, canonType, urlrewriter, stream, newHeaders, origResponse):
|
def _rewriteContent(self, textType, urlrewriter, stream, newHeaders, origResponse, encoding = 'utf-8'):
|
||||||
if canonType == 'html':
|
if textType == 'html':
|
||||||
out = StringIO.StringIO()
|
out = StringIO.StringIO()
|
||||||
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
|
htmlrewriter = wbhtml.WBHtml(urlrewriter, out, self.headInsert)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
buff = stream.read()
|
buff = stream.read()#.decode(encoding)
|
||||||
while buff:
|
while buff:
|
||||||
htmlrewriter.feed(buff)
|
htmlrewriter.feed(buff)
|
||||||
buff = stream.read()
|
buff = stream.read()#.decode(encoding)
|
||||||
|
|
||||||
htmlrewriter.close()
|
htmlrewriter.close()
|
||||||
|
|
||||||
@ -207,12 +206,13 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
value = [out.getvalue()]
|
value = [out.getvalue()]
|
||||||
|
newHeaders.append(('Content-Length', str(len(value[0]))))
|
||||||
out.close()
|
out.close()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if canonType == 'css':
|
if textType == 'css':
|
||||||
rewriter = regexmatch.CSSRewriter(urlrewriter)
|
rewriter = regexmatch.CSSRewriter(urlrewriter)
|
||||||
elif canonType == 'js':
|
elif textType == 'js':
|
||||||
rewriter = regexmatch.JSRewriter(urlrewriter)
|
rewriter = regexmatch.JSRewriter(urlrewriter)
|
||||||
|
|
||||||
def gen():
|
def gen():
|
||||||
@ -231,7 +231,7 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _rewriteHeaders(self, headers, stripEncoding = False):
|
def _rewriteHeaders(self, headers, urlrewriter, stripEncoding = False):
|
||||||
newHeaders = []
|
newHeaders = []
|
||||||
removedHeaders = []
|
removedHeaders = []
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ class UnknownLoaderProtocolException(CaptureException):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
class InvalidArchiveRecordException(CaptureException):
|
class InvalidArchiveRecordException(CaptureException):
|
||||||
def __init__(msg, errList = None):
|
def __init__(self, msg, errList = None):
|
||||||
super(InvalidArchiveRecordException, self).__init__(msg)
|
super(InvalidArchiveRecordException, self).__init__(msg)
|
||||||
self.errList = errList
|
self.errList = errList
|
||||||
|
|
||||||
|
@ -19,6 +19,13 @@ class WBHtml(HTMLParser):
|
|||||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||||
|
|
||||||
|
>>> parse('<input "selected"><img src></div>')
|
||||||
|
<input "selected"><img src></div>
|
||||||
|
|
||||||
|
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||||
|
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||||
|
|
||||||
|
# Meta tag
|
||||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
|
|
||||||
@ -35,8 +42,8 @@ class WBHtml(HTMLParser):
|
|||||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||||
|
|
||||||
>>> parse('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
|
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>
|
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||||
|
|
||||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||||
@ -130,14 +137,14 @@ class WBHtml(HTMLParser):
|
|||||||
# ===========================
|
# ===========================
|
||||||
|
|
||||||
def _rewriteURL(self, value, mod = None):
|
def _rewriteURL(self, value, mod = None):
|
||||||
return self.rewriter.rewrite(value, mod)
|
return self.rewriter.rewrite(value, mod) if value else None
|
||||||
|
|
||||||
|
|
||||||
def _rewriteCSS(self, cssContent):
|
def _rewriteCSS(self, cssContent):
|
||||||
return self.cssRewriter.replaceAll(cssContent)
|
return self.cssRewriter.replaceAll(cssContent) if cssContent else None
|
||||||
|
|
||||||
def _rewriteScript(self, scriptContent):
|
def _rewriteScript(self, scriptContent):
|
||||||
return self.jsRewriter.replaceAll(scriptContent)
|
return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
|
||||||
|
|
||||||
def hasAttr(self, tagAttrs, attr):
|
def hasAttr(self, tagAttrs, attr):
|
||||||
name, value = attr
|
name, value = attr
|
||||||
@ -147,14 +154,9 @@ class WBHtml(HTMLParser):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
||||||
# special case: base tag
|
|
||||||
if (tag == 'base'):
|
|
||||||
newBase = tagAttrs.get('href')
|
|
||||||
if newBase:
|
|
||||||
self.rewriter.setBaseUrl(newBase[1])
|
|
||||||
|
|
||||||
# special case: script or style parse context
|
# special case: script or style parse context
|
||||||
elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
||||||
self._wbParseContext = tag
|
self._wbParseContext = tag
|
||||||
|
|
||||||
# special case: head insertion, non-head tags
|
# special case: head insertion, non-head tags
|
||||||
@ -176,7 +178,7 @@ class WBHtml(HTMLParser):
|
|||||||
attrName, attrValue = attr
|
attrName, attrValue = attr
|
||||||
|
|
||||||
# special case: inline JS/event handler
|
# special case: inline JS/event handler
|
||||||
if attrValue.startswith('javascript:') or attrName.startswith("on"):
|
if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith("on"):
|
||||||
attrValue = self._rewriteScript(attrValue)
|
attrValue = self._rewriteScript(attrValue)
|
||||||
|
|
||||||
# special case: inline CSS/style attribute
|
# special case: inline CSS/style attribute
|
||||||
@ -189,11 +191,19 @@ class WBHtml(HTMLParser):
|
|||||||
attrValue = self._rewriteMetaRefresh(attrValue)
|
attrValue = self._rewriteMetaRefresh(attrValue)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
# special case: base tag
|
||||||
|
if (tag == 'base') and (attrName == 'href') and attrValue:
|
||||||
|
self.rewriter.setBaseUrl(attrValue)
|
||||||
|
|
||||||
rwMod = handler.get(attrName)
|
rwMod = handler.get(attrName)
|
||||||
if rwMod is not None:
|
if rwMod is not None:
|
||||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||||
|
|
||||||
self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||||
|
if attrValue:
|
||||||
|
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||||
|
else:
|
||||||
|
self.out.write(' ' + attrName)
|
||||||
|
|
||||||
self.out.write('/>' if isStartEnd else '>')
|
self.out.write('/>' if isStartEnd else '>')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user