From 43716f5f1da8511f1f81d1c7eb2f72e0dc9811a5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 30 Jul 2015 23:06:34 -0700 Subject: [PATCH] rewrite: add HostScopeCookieRewriter and make it default -- set cookie scope to rewritten host wombat: ensure rewrite html is using a string bump to 0.10.5! --- README.rst | 2 +- pywb/__init__.py | 2 +- pywb/rewrite/cookie_rewriter.py | 40 +- pywb/rewrite/test/test_cookie_rewriter.py | 22 +- pywb/static/url.js | 616 ---------------------- pywb/static/wombat.js | 6 + 6 files changed, 62 insertions(+), 626 deletions(-) delete mode 100644 pywb/static/url.js diff --git a/README.rst b/README.rst index 3f4112af..b4f17581 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.10.3 +PyWb 0.10.5 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index 71e7617e..c2792dfa 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.10.3-dev' +__version__ = '0.10.5' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index fc2cf9e1..bac0d727 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -66,6 +66,40 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): return morsel +#================================================================= +class HostScopeCookieRewriter(WbUrlBaseCookieRewriter): + """ + Attempt to rewrite cookies to current host url.. + + If path present, rewrite path to current host. Only makes sense in live + proxy or no redirect mode, as otherwise timestamp may change. + + If domain present, remove domain and set to path prefix + """ + + def rewrite_cookie(self, name, morsel): + new_path = None + + # if domain set, expand cookie to host prefix + if morsel.get('domain'): + del morsel['domain'] + new_path = self.url_rewriter.rewrite('/') + + # set cookie to rewritten path + elif morsel.get('path'): + new_path = self.url_rewriter.rewrite(morsel['path']) + + if new_path: + inx = new_path.find(self.url_rewriter.rel_prefix) + if inx > 0: + new_path = new_path[inx:] + + morsel['path'] = new_path + + self._remove_age_opts(morsel) + return morsel + + #================================================================= class ExactPathCookieRewriter(WbUrlBaseCookieRewriter): """ @@ -112,7 +146,11 @@ def get_cookie_rewriter(cookie_scope): return RootScopeCookieRewriter elif cookie_scope == 'exact': return ExactPathCookieRewriter + elif cookie_scope == 'host': + return HostScopeCookieRewriter elif cookie_scope == 'removeall': return RemoveAllCookiesRewriter - else: + elif cookie_scope == 'coll': return MinimalScopeCookieRewriter + else: + return HostScopeCookieRewriter diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index 3120b827..10ae01bd 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -1,27 +1,27 @@ r""" -# Default -- MinimalScopeRewriter +# Default -- MinimalScopeRewriter (Collection scope) # No rewriting >>> rewrite_cookie('a=b; c=d;') [('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] ->>> rewrite_cookie('some=value; Path=/;') +>>> rewrite_cookie('some=value; Path=/;', urlrewriter, 'coll') [('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')] ->>> rewrite_cookie('some=value; Path=../;', rewriter=urlrewriter2) +>>> rewrite_cookie('some=value; Path=../;', urlrewriter2, 'coll') [('Set-Cookie', 'some=value; Path=/preview/em_/http://example.com/')] ->>> rewrite_cookie('some=value; Path=/diff/path/;') +>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, 'coll') [('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')] # if domain set, set path to root ->>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500') +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, 'coll') [('Set-Cookie', 'some=value; Path=/pywb/')] ->>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT') +>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll') [('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] # Cookie with invalid chars, not parsed ->>> rewrite_cookie('abc@def=123') +>>> rewrite_cookie('abc@def=123', urlrewriter, 'coll') [] @@ -33,6 +33,14 @@ r""" [('Set-Cookie', 'some=value')] +# HostCookieRewriter -- set path to host +>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, 'host') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')] + +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, 'host') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')] + + # RootCookieRewriter -- always sets Path=/, removes Domain >>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, 'root') [('Set-Cookie', 'some=value; Path=/')] diff --git a/pywb/static/url.js b/pywb/static/url.js deleted file mode 100644 index b4561e50..00000000 --- a/pywb/static/url.js +++ /dev/null @@ -1,616 +0,0 @@ -/* Any copyright is dedicated to the Public Domain. - * http://creativecommons.org/publicdomain/zero/1.0/ */ - -(function(scope) { - 'use strict'; - - // feature detect for URL constructor - var hasWorkingUrl = false; - if (!scope.forceJURL) { - try { - var u = new URL('b', 'http://a'); - u.pathname = 'c%20d'; - hasWorkingUrl = u.href === 'http://a/c%20d'; - } catch(e) {} - } - - if (hasWorkingUrl) - return; - - var relative = Object.create(null); - relative['ftp'] = 21; - relative['file'] = 0; - relative['gopher'] = 70; - relative['http'] = 80; - relative['https'] = 443; - relative['ws'] = 80; - relative['wss'] = 443; - - var relativePathDotMapping = Object.create(null); - relativePathDotMapping['%2e'] = '.'; - relativePathDotMapping['.%2e'] = '..'; - relativePathDotMapping['%2e.'] = '..'; - relativePathDotMapping['%2e%2e'] = '..'; - - function isRelativeScheme(scheme) { - return relative[scheme] !== undefined; - } - - function invalid() { - clear.call(this); - this._isInvalid = true; - } - - function IDNAToASCII(h) { - if ('' == h) { - invalid.call(this) - } - // XXX - return h.toLowerCase() - } - - function percentEscape(c) { - var unicode = c.charCodeAt(0); - if (unicode > 0x20 && - unicode < 0x7F && - // " # < > ? ` - [0x22, 0x23, 0x3C, 0x3E, 0x3F, 0x60].indexOf(unicode) == -1 - ) { - return c; - } - return encodeURIComponent(c); - } - - function percentEscapeQuery(c) { - // XXX This actually needs to encode c using encoding and then - // convert the bytes one-by-one. - - var unicode = c.charCodeAt(0); - if (unicode > 0x20 && - unicode < 0x7F && - // " # < > ` (do not escape '?') - [0x22, 0x23, 0x3C, 0x3E, 0x60].indexOf(unicode) == -1 - ) { - return c; - } - return encodeURIComponent(c); - } - - var EOF = undefined, - ALPHA = /[a-zA-Z]/, - ALPHANUMERIC = /[a-zA-Z0-9\+\-\.]/; - - function parse(input, stateOverride, base) { - function err(message) { - errors.push(message) - } - - var state = stateOverride || 'scheme start', - cursor = 0, - buffer = '', - seenAt = false, - seenBracket = false, - errors = []; - - loop: while ((input[cursor - 1] != EOF || cursor == 0) && !this._isInvalid) { - var c = input[cursor]; - switch (state) { - case 'scheme start': - if (c && ALPHA.test(c)) { - buffer += c.toLowerCase(); // ASCII-safe - state = 'scheme'; - } else if (!stateOverride) { - buffer = ''; - state = 'no scheme'; - continue; - } else { - err('Invalid scheme.'); - break loop; - } - break; - - case 'scheme': - if (c && ALPHANUMERIC.test(c)) { - buffer += c.toLowerCase(); // ASCII-safe - } else if (':' == c) { - this._scheme = buffer; - buffer = ''; - if (stateOverride) { - break loop; - } - if (isRelativeScheme(this._scheme)) { - this._isRelative = true; - } - if ('file' == this._scheme) { - state = 'relative'; - } else if (this._isRelative && base && base._scheme == this._scheme) { - state = 'relative or authority'; - } else if (this._isRelative) { - state = 'authority first slash'; - } else { - state = 'scheme data'; - } - } else if (!stateOverride) { - buffer = ''; - cursor = 0; - state = 'no scheme'; - continue; - } else if (EOF == c) { - break loop; - } else { - err('Code point not allowed in scheme: ' + c) - break loop; - } - break; - - case 'scheme data': - if ('?' == c) { - query = '?'; - state = 'query'; - } else if ('#' == c) { - this._fragment = '#'; - state = 'fragment'; - } else { - // XXX error handling - if (EOF != c && '\t' != c && '\n' != c && '\r' != c) { - this._schemeData += percentEscape(c); - } - } - break; - - case 'no scheme': - if (!base || !(isRelativeScheme(base._scheme))) { - err('Missing scheme.'); - invalid.call(this); - } else { - state = 'relative'; - continue; - } - break; - - case 'relative or authority': - if ('/' == c && '/' == input[cursor+1]) { - state = 'authority ignore slashes'; - } else { - err('Expected /, got: ' + c); - state = 'relative'; - continue - } - break; - - case 'relative': - this._isRelative = true; - if ('file' != this._scheme) - this._scheme = base._scheme; - if (EOF == c) { - this._host = base._host; - this._port = base._port; - this._path = base._path.slice(); - this._query = base._query; - this._username = base._username; - this._password = base._password; - break loop; - } else if ('/' == c || '\\' == c) { - if ('\\' == c) - err('\\ is an invalid code point.'); - state = 'relative slash'; - } else if ('?' == c) { - this._host = base._host; - this._port = base._port; - this._path = base._path.slice(); - this._query = '?'; - this._username = base._username; - this._password = base._password; - state = 'query'; - } else if ('#' == c) { - this._host = base._host; - this._port = base._port; - this._path = base._path.slice(); - this._query = base._query; - this._fragment = '#'; - this._username = base._username; - this._password = base._password; - state = 'fragment'; - } else { - var nextC = input[cursor+1] - var nextNextC = input[cursor+2] - if ( - 'file' != this._scheme || !ALPHA.test(c) || - (nextC != ':' && nextC != '|') || - (EOF != nextNextC && '/' != nextNextC && '\\' != nextNextC && '?' != nextNextC && '#' != nextNextC)) { - this._host = base._host; - this._port = base._port; - this._username = base._username; - this._password = base._password; - this._path = base._path.slice(); - this._path.pop(); - } - state = 'relative path'; - continue; - } - break; - - case 'relative slash': - if ('/' == c || '\\' == c) { - if ('\\' == c) { - err('\\ is an invalid code point.'); - } - if ('file' == this._scheme) { - state = 'file host'; - } else { - state = 'authority ignore slashes'; - } - } else { - if ('file' != this._scheme) { - this._host = base._host; - this._port = base._port; - this._username = base._username; - this._password = base._password; - } - state = 'relative path'; - continue; - } - break; - - case 'authority first slash': - if ('/' == c) { - state = 'authority second slash'; - } else { - err("Expected '/', got: " + c); - state = 'authority ignore slashes'; - continue; - } - break; - - case 'authority second slash': - state = 'authority ignore slashes'; - if ('/' != c) { - err("Expected '/', got: " + c); - continue; - } - break; - - case 'authority ignore slashes': - if ('/' != c && '\\' != c) { - state = 'authority'; - continue; - } else { - err('Expected authority, got: ' + c); - } - break; - - case 'authority': - if ('@' == c) { - if (seenAt) { - err('@ already seen.'); - buffer += '%40'; - } - seenAt = true; - for (var i = 0; i < buffer.length; i++) { - var cp = buffer[i]; - if ('\t' == cp || '\n' == cp || '\r' == cp) { - err('Invalid whitespace in authority.'); - continue; - } - // XXX check URL code points - if (':' == cp && null === this._password) { - this._password = ''; - continue; - } - var tempC = percentEscape(cp); - (null !== this._password) ? this._password += tempC : this._username += tempC; - } - buffer = ''; - } else if (EOF == c || '/' == c || '\\' == c || '?' == c || '#' == c) { - cursor -= buffer.length; - buffer = ''; - state = 'host'; - continue; - } else { - buffer += c; - } - break; - - case 'file host': - if (EOF == c || '/' == c || '\\' == c || '?' == c || '#' == c) { - if (buffer.length == 2 && ALPHA.test(buffer[0]) && (buffer[1] == ':' || buffer[1] == '|')) { - state = 'relative path'; - } else if (buffer.length == 0) { - state = 'relative path start'; - } else { - this._host = IDNAToASCII.call(this, buffer); - buffer = ''; - state = 'relative path start'; - } - continue; - } else if ('\t' == c || '\n' == c || '\r' == c) { - err('Invalid whitespace in file host.'); - } else { - buffer += c; - } - break; - - case 'host': - case 'hostname': - if (':' == c && !seenBracket) { - // XXX host parsing - this._host = IDNAToASCII.call(this, buffer); - buffer = ''; - state = 'port'; - if ('hostname' == stateOverride) { - break loop; - } - } else if (EOF == c || '/' == c || '\\' == c || '?' == c || '#' == c) { - this._host = IDNAToASCII.call(this, buffer); - buffer = ''; - state = 'relative path start'; - if (stateOverride) { - break loop; - } - continue; - } else if ('\t' != c && '\n' != c && '\r' != c) { - if ('[' == c) { - seenBracket = true; - } else if (']' == c) { - seenBracket = false; - } - buffer += c; - } else { - err('Invalid code point in host/hostname: ' + c); - } - break; - - case 'port': - if (/[0-9]/.test(c)) { - buffer += c; - } else if (EOF == c || '/' == c || '\\' == c || '?' == c || '#' == c || stateOverride) { - if ('' != buffer) { - var temp = parseInt(buffer, 10); - if (temp != relative[this._scheme]) { - this._port = temp + ''; - } - buffer = ''; - } - if (stateOverride) { - break loop; - } - state = 'relative path start'; - continue; - } else if ('\t' == c || '\n' == c || '\r' == c) { - err('Invalid code point in port: ' + c); - } else { - invalid.call(this); - } - break; - - case 'relative path start': - if ('\\' == c) - err("'\\' not allowed in path."); - state = 'relative path'; - if ('/' != c && '\\' != c) { - continue; - } - break; - - case 'relative path': - if (EOF == c || '/' == c || '\\' == c || (!stateOverride && ('?' == c || '#' == c))) { - if ('\\' == c) { - err('\\ not allowed in relative path.'); - } - var tmp; - if (tmp = relativePathDotMapping[buffer.toLowerCase()]) { - buffer = tmp; - } - if ('..' == buffer) { - this._path.pop(); - if ('/' != c && '\\' != c) { - this._path.push(''); - } - } else if ('.' == buffer && '/' != c && '\\' != c) { - this._path.push(''); - } else if ('.' != buffer) { - if ('file' == this._scheme && this._path.length == 0 && buffer.length == 2 && ALPHA.test(buffer[0]) && buffer[1] == '|') { - buffer = buffer[0] + ':'; - } - this._path.push(buffer); - } - buffer = ''; - if ('?' == c) { - this._query = '?'; - state = 'query'; - } else if ('#' == c) { - this._fragment = '#'; - state = 'fragment'; - } - } else if ('\t' != c && '\n' != c && '\r' != c) { - buffer += percentEscape(c); - } - break; - - case 'query': - if (!stateOverride && '#' == c) { - this._fragment = '#'; - state = 'fragment'; - } else if (EOF != c && '\t' != c && '\n' != c && '\r' != c) { - this._query += percentEscapeQuery(c); - } - break; - - case 'fragment': - if (EOF != c && '\t' != c && '\n' != c && '\r' != c) { - this._fragment += c; - } - break; - } - - cursor++; - } - } - - function clear() { - this._scheme = ''; - this._schemeData = ''; - this._username = ''; - this._password = null; - this._host = ''; - this._port = ''; - this._path = []; - this._query = ''; - this._fragment = ''; - this._isInvalid = false; - this._isRelative = false; - } - - // Does not process domain names or IP addresses. - // Does not handle encoding for the query parameter. - function jURL(url, base /* , encoding */) { - if (base !== undefined && !(base instanceof jURL)) - base = new jURL(String(base)); - - this._url = url; - clear.call(this); - - var input = url.replace(/^[ \t\r\n\f]+|[ \t\r\n\f]+$/g, ''); - // encoding = encoding || 'utf-8' - - parse.call(this, input, null, base); - } - - jURL.prototype = { - toString: function() { - return this.href; - }, - get href() { - if (this._isInvalid) - return this._url; - - var authority = ''; - if ('' != this._username || null != this._password) { - authority = this._username + - (null != this._password ? ':' + this._password : '') + '@'; - } - - return this.protocol + - (this._isRelative ? '//' + authority + this.host : '') + - this.pathname + this._query + this._fragment; - }, - set href(href) { - clear.call(this); - parse.call(this, href); - }, - - get protocol() { - return this._scheme + ':'; - }, - set protocol(protocol) { - if (this._isInvalid) - return; - parse.call(this, protocol + ':', 'scheme start'); - }, - - get host() { - return this._isInvalid ? '' : this._port ? - this._host + ':' + this._port : this._host; - }, - set host(host) { - if (this._isInvalid || !this._isRelative) - return; - parse.call(this, host, 'host'); - }, - - get hostname() { - return this._host; - }, - set hostname(hostname) { - if (this._isInvalid || !this._isRelative) - return; - parse.call(this, hostname, 'hostname'); - }, - - get port() { - return this._port; - }, - set port(port) { - if (this._isInvalid || !this._isRelative) - return; - parse.call(this, port, 'port'); - }, - - get pathname() { - return this._isInvalid ? '' : this._isRelative ? - '/' + this._path.join('/') : this._schemeData; - }, - set pathname(pathname) { - if (this._isInvalid || !this._isRelative) - return; - this._path = []; - parse.call(this, pathname, 'relative path start'); - }, - - get search() { - return this._isInvalid || !this._query || '?' == this._query ? - '' : this._query; - }, - set search(search) { - if (this._isInvalid || !this._isRelative) - return; - this._query = '?'; - if ('?' == search[0]) - search = search.slice(1); - parse.call(this, search, 'query'); - }, - - get hash() { - return this._isInvalid || !this._fragment || '#' == this._fragment ? - '' : this._fragment; - }, - set hash(hash) { - if (this._isInvalid) - return; - this._fragment = '#'; - if ('#' == hash[0]) - hash = hash.slice(1); - parse.call(this, hash, 'fragment'); - }, - - get origin() { - var host; - if (this._isInvalid || !this._scheme) { - return ''; - } - // javascript: Gecko returns String(""), WebKit/Blink String("null") - // Gecko throws error for "data://" - // data: Gecko returns "", Blink returns "data://", WebKit returns "null" - // Gecko returns String("") for file: mailto: - // WebKit/Blink returns String("SCHEME://") for file: mailto: - switch (this._scheme) { - case 'data': - case 'file': - case 'javascript': - case 'mailto': - return 'null'; - } - host = this.host; - if (!host) { - return ''; - } - return this._scheme + '://' + host; - } - }; - - // Copy over the static methods - var OriginalURL = scope.URL; - if (OriginalURL) { - jURL.createObjectURL = function(blob) { - // IE extension allows a second optional options argument. - // http://msdn.microsoft.com/en-us/library/ie/hh772302(v=vs.85).aspx - return OriginalURL.createObjectURL.apply(OriginalURL, arguments); - }; - jURL.revokeObjectURL = function(url) { - OriginalURL.revokeObjectURL(url); - }; - } - - scope.URL = jURL; - -})(this); diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 6ae26586..238248e9 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -1054,12 +1054,18 @@ var wombat_internal = function($wbwindow) { //============================================ function rewrite_html(string) { + if (!string) { + return string; + } + var inner_doc = new DOMParser().parseFromString(string, "text/html"); if (!inner_doc) { return string; } + string = string.toString(); + var new_html = ""; var head = ""; var body = "";