diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 7594efdf..8b8d0593 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -20,6 +20,13 @@ class BaseContentRewriter(object): TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S) + # set via html_rewriter since it overrides the default one + html_unescape = None + + @classmethod + def set_unescape(cls, unescape): + cls.html_unescape = unescape + @classmethod def _extract_title(cls, gen): title_res = list(gen) @@ -31,7 +38,13 @@ class BaseContentRewriter(object): return title_res = m.group(1) - return title_res.strip() + title_res = title_res.strip() + try: + title_res = cls.html_unescape(title_res) + except Exception as e: + pass + + return title_res def __init__(self, rules_file, replay_mod=''): self.rules = [] diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 8c91989e..f3376fc2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -11,7 +11,7 @@ from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter -from pywb.rewrite.content_rewriter import StreamingRewriter +from pywb.rewrite.content_rewriter import StreamingRewriter, BaseContentRewriter from six import text_type @@ -20,9 +20,16 @@ import six.moves.html_parser try: orig_unescape = six.moves.html_parser.unescape six.moves.html_parser.unescape = lambda x: x + BaseContentRewriter.set_unescape(orig_unescape) except: orig_unescape = None + @staticmethod + def __unescape(x): + return HTMLParser().unescape(x) + + BaseContentRewriter.set_unescape(__unescape) + try: import _markupbase as markupbase diff --git a/pywb/static/wombatProxyMode.js b/pywb/static/wombatProxyMode.js index 6557ce49..2ee4f844 100644 --- a/pywb/static/wombatProxyMode.js +++ b/pywb/static/wombatProxyMode.js @@ -16,4 +16,4 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with pywb. If not, see . */ -(function(){function autobind(clazz){for(var prop,propValue,proto=clazz.__proto__||clazz.constructor.prototype||clazz.prototype,clazzProps=Object.getOwnPropertyNames(proto),len=clazzProps.length,i=0;i source[srcset], picture > source[data-srcset], picture > source[data-src], video > source[srcset], video > source[data-srcset], video > source[data-src], audio > source[srcset], audio > source[data-srcset], audio > source[data-src]",autobind(this),this._init(config,true)):new AutoFetcherProxyMode(wombat,config)}function WombatLite($wbwindow,wbinfo){return this instanceof WombatLite?void(this.wb_info=wbinfo,this.$wbwindow=$wbwindow,this.wb_info.top_host=this.wb_info.top_host||"*",this.wb_info.wombat_opts=this.wb_info.wombat_opts||{},this.WBAutoFetchWorker=null,this.historyCB=null):new WombatLite($wbwindow,wbinfo)}AutoFetcherProxyMode.prototype._init=function(config,first){var afwpm=this,wombat=this.wombat;if(document.readyState==="complete")return this.styleTag=document.createElement("style"),this.styleTag.id="$wrStyleParser$",document.head.appendChild(this.styleTag),void(config.isTop?fetch(config.workerURL).then(function(res){res.text().then(function(text){var blob=new Blob([text],{type:"text/javascript"});afwpm.worker=new wombat.$wbwindow.Worker(URL.createObjectURL(blob),{type:"classic",credentials:"include"}),afwpm.startChecking()}).catch(error=>{console.error("Could not create the backing worker for AutoFetchWorkerProxyMode"),console.error(error)})}):(this.worker={postMessage:function(msg){msg.wb_type||(msg={wb_type:"aaworker",msg:msg}),wombat.$wbwindow.top.postMessage(msg,"*")},terminate:function(){}},this.startChecking()));if(first)var i=setInterval(function(){document.readyState==="complete"&&(afwpm._init(config),clearInterval(i))},1e3)},AutoFetcherProxyMode.prototype.startChecking=function(){for(;this.worker&&this.msgQ.length;)this.postMessage(this.msgQ.shift());this.extractFromLocalDoc(),this.mutationObz=new MutationObserver(this.mutationCB),this.mutationObz.observe(document.documentElement,{characterData:false,characterDataOldValue:false,attributes:true,attributeOldValue:true,subtree:true,childList:true,attributeFilter:["src","srcset"]})},AutoFetcherProxyMode.prototype.terminate=function(){this.worker&&this.worker.terminate()},AutoFetcherProxyMode.prototype.justFetch=function(urls){this.postMessage({type:"fetch-all",values:urls})},AutoFetcherProxyMode.prototype.fetchAsPage=function(url,title){if(url){var headers={"X-Wombat-History-Page":url};if(title){var encodedTitle=encodeURIComponent(title.trim());title&&(headers["X-Wombat-History-Title"]=encodedTitle)}var fetchData={url:url,options:{headers:headers}};this.justFetch([fetchData])}},AutoFetcherProxyMode.prototype.postMessage=function(msg){this.worker?this.worker.postMessage(msg):this.msgQ.push(msg)},AutoFetcherProxyMode.prototype.handleMutatedStyleElem=function(elem,accum,text){var checkNode,baseURI=document.baseURI;if(text){if(!elem.parentNode||elem.parentNode.localName!=="style")return;checkNode=elem.parentNode}else checkNode=elem;try{var extractedMedia=this.extractMediaRules(checkNode.sheet,baseURI);if(extractedMedia.length)return void(accum.media=accum.media.concat(extractedMedia))}catch(e){}!text&&checkNode.href&&accum.deferred.push(this.fetchCSSAndExtract(checkNode.href))},AutoFetcherProxyMode.prototype.handleMutatedElem=function(elem,accum){var baseURI=document.baseURI;if(elem.nodeType===Node.TEXT_NODE)return this.handleMutatedStyleElem(elem,accum,true);switch(elem.localName){case"img":case"video":case"audio":case"source":return this.handleDomElement(elem,baseURI,accum);case"style":return this.handleMutatedStyleElem(elem,accum);case"link":if(elem.rel==="stylesheet"||elem.rel==="preload"&&elem.as==="style")return this.handleMutatedStyleElem(elem,accum);}return this.extractSrcSrcsetFrom(elem,baseURI,accum)},AutoFetcherProxyMode.prototype.mutationCB=function(mutationList,observer){for(var accum={type:"values",srcset:[],src:[],media:[],deferred:[]},i=0;i source[srcset], picture > source[data-srcset], picture > source[data-src], video > source[srcset], video > source[data-srcset], video > source[data-src], audio > source[srcset], audio > source[data-srcset], audio > source[data-src]",autobind(this),this._init(config,true)):new AutoFetcherProxyMode(wombat,config)}function WombatLite($wbwindow,wbinfo){return this instanceof WombatLite?void(this.wb_info=wbinfo,this.$wbwindow=$wbwindow,this.wb_info.top_host=this.wb_info.top_host||"*",this.wb_info.wombat_opts=this.wb_info.wombat_opts||{},this.WBAutoFetchWorker=null,this.historyCB=null):new WombatLite($wbwindow,wbinfo)}AutoFetcherProxyMode.prototype._init=function(config,first){var afwpm=this,wombat=this.wombat;if(document.readyState==="complete")return this.styleTag=document.createElement("style"),this.styleTag.id="$wrStyleParser$",document.head.appendChild(this.styleTag),void(config.isTop?fetch(config.workerURL).then(function(res){res.text().then(function(text){var blob=new Blob([text],{type:"text/javascript"});afwpm.worker=new wombat.$wbwindow.Worker(URL.createObjectURL(blob),{type:"classic",credentials:"include"}),afwpm.startChecking()}).catch(error=>{console.error("Could not create the backing worker for AutoFetchWorkerProxyMode"),console.error(error)})}):(this.worker={postMessage:function(msg){msg.wb_type||(msg={wb_type:"aaworker",msg:msg}),wombat.$wbwindow.top.postMessage(msg,"*")},terminate:function(){}},this.startChecking()));if(first)var i=setInterval(function(){document.readyState==="complete"&&(afwpm._init(config),clearInterval(i))},1e3)},AutoFetcherProxyMode.prototype.startChecking=function(){for(;this.worker&&this.msgQ.length;)this.postMessage(this.msgQ.shift());this.extractFromLocalDoc(),this.mutationObz=new MutationObserver(this.mutationCB),this.mutationObz.observe(document.documentElement,{characterData:false,characterDataOldValue:false,attributes:true,attributeOldValue:true,subtree:true,childList:true,attributeFilter:["src","srcset"]})},AutoFetcherProxyMode.prototype.terminate=function(){this.worker&&this.worker.terminate()},AutoFetcherProxyMode.prototype.justFetch=function(urls){this.postMessage({type:"fetch-all",values:urls})},AutoFetcherProxyMode.prototype.fetchAsPage=function(url,title){if(url){var headers={"X-Wombat-History-Page":url};if(title){var encodedTitle=encodeURIComponent(title.trim());title&&(headers["X-Wombat-History-Title"]=encodedTitle)}var fetchData={url:url,options:{headers:headers}};this.justFetch([fetchData])}},AutoFetcherProxyMode.prototype.postMessage=function(msg){this.worker?this.worker.postMessage(msg):this.msgQ.push(msg)},AutoFetcherProxyMode.prototype.handleMutatedStyleElem=function(elem,accum,text){var checkNode,baseURI=document.baseURI;if(text){if(!elem.parentNode||elem.parentNode.localName!=="style")return;checkNode=elem.parentNode}else checkNode=elem;try{var extractedMedia=this.extractMediaRules(checkNode.sheet,baseURI);if(extractedMedia.length)return void(accum.media=accum.media.concat(extractedMedia))}catch(e){}!text&&checkNode.href&&accum.deferred.push(this.fetchCSSAndExtract(checkNode.href))},AutoFetcherProxyMode.prototype.handleMutatedElem=function(elem,accum){var baseURI=document.baseURI;if(elem.nodeType===Node.TEXT_NODE)return this.handleMutatedStyleElem(elem,accum,true);switch(elem.localName){case"img":case"video":case"audio":case"source":return this.handleDomElement(elem,baseURI,accum);case"style":return this.handleMutatedStyleElem(elem,accum);case"link":if(elem.rel==="stylesheet"||elem.rel==="preload"&&elem.as==="style")return this.handleMutatedStyleElem(elem,accum);}return this.extractSrcSrcsetFrom(elem,baseURI,accum)},AutoFetcherProxyMode.prototype.mutationCB=function(mutationList,observer){for(var accum={type:"values",srcset:[],src:[],media:[],deferred:[]},i=0;iTest'Title' + + headers = [('Content-Length', str(len(body))), + ('Content-Type', 'text/html')] + start_response('200 OK', headers=headers) return [body] @@ -132,6 +138,19 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest): assert 'Set-Cookie' not in resp.headers assert resp.text == 'cookie value: testcookie=cookie-val' + def test_fetch_page_with_html_title(self, fmod_sl): + resp = self.get('/live/{0}http://localhost:%s/html-title' % self.header_test_serv.port, fmod_sl, + headers={'X-Wombat-History-Page': 'http://localhost:{0}/html-title'.format(self.header_test_serv.port), + }) + assert resp.json == {'title': "Test'Title"} + + def test_fetch_page_with_title(self, fmod_sl): + resp = self.get('/live/{0}http://httpbin.org/html', fmod_sl, + headers={'X-Wombat-History-Page': 'http://httpbin.org/html', + 'X-Wombat-History-Title': 'Test%20Title', + }) + assert resp.json == {'title': 'Test Title'} + def test_live_live_frame(self): resp = self.testapp.get('/live/http://example.com/') assert resp.status_int == 200 diff --git a/wombat b/wombat index 5fdacc6c..e647aa17 160000 --- a/wombat +++ b/wombat @@ -1 +1 @@ -Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3 +Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2