diff --git a/README.rst b/README.rst index f9fc0fc2..2132040f 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.3.1 +PyWb 0.4.0 Beta ============= .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py new file mode 100644 index 00000000..070912d9 --- /dev/null +++ b/pywb/rewrite/cookie_rewriter.py @@ -0,0 +1,29 @@ +import Cookie + + +#================================================================= +class WbUrlCookieRewriter(object): + """ Cookie rewriter for wburl-based requests + Remove the domain and rewrite path, if any, to match + given WbUrl using the url rewriter. + """ + def __init__(self, url_rewriter): + self.url_rewriter = url_rewriter + + def rewrite(self, cookie_str, header='Set-Cookie'): + cookie = Cookie.SimpleCookie() + cookie.load(cookie_str) + + results = [] + + for name, morsel in cookie.iteritems(): + if morsel.get('domain'): + del morsel['domain'] + if morsel.get('path'): + morsel['path'] = self.url_rewriter.rewrite(morsel['path']) + if morsel.get('expires'): + del morsel['expires'] + + results.append((header, morsel.OutputString())) + + return results diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 93b007de..25b27de4 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -39,6 +39,8 @@ class HeaderRewriter: PROXY_NO_REWRITE_HEADERS = ['content-length'] + COOKIE_HEADERS = ['set-cookie', 'cookie'] + def __init__(self, header_prefix='X-Archive-Orig-'): self.header_prefix = header_prefix @@ -86,6 +88,8 @@ class HeaderRewriter: new_headers = [] removed_header_dict = {} + cookie_rewriter = urlrewriter.get_cookie_rewriter() + for (name, value) in headers: lowername = name.lower() @@ -109,6 +113,11 @@ class HeaderRewriter: not content_rewritten): new_headers.append((name, value)) + elif (lowername in self.COOKIE_HEADERS and + cookie_rewriter): + cookie_list = cookie_rewriter.rewrite(value) + new_headers.extend(cookie_list) + else: new_headers.append((self.header_prefix + name, value)) diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 25733b94..bf3c5f08 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -6,7 +6,9 @@ import requests import datetime import mimetypes -from pywb.utils.loaders import is_http +from urlparse import urlsplit + +from pywb.utils.loaders import is_http, LimitReader from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize @@ -23,6 +25,12 @@ class LiveRewriter(object): ('HTTP_ACCEPT_CHARSET', 'Accept-Charset'), ('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'), ('HTTP_RANGE', 'Range'), + ('HTTP_CACHE_CONTROL', 'Cache-Control'), + ('HTTP_X_REQUESTED_WITH', 'X-Requested-With'), + ('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'), + ('HTTP_COOKIE', 'Cookie'), + ('CONTENT_TYPE', 'Content-Type'), + ('CONTENT_LENGTH', 'Content-Length'), ('REL_REFERER', 'Referer'), ] @@ -67,10 +75,23 @@ class LiveRewriter(object): method = env['REQUEST_METHOD'].upper() input_ = env['wsgi.input'] + host = env.get('HTTP_HOST') + origin = env.get('HTTP_ORIGIN') + if host or origin: + splits = urlsplit(url) + if host: + req_headers['Host'] = splits.netloc + if origin: + req_headers['Origin'] = (splits.scheme + '://' + splits.netloc) + req_headers.update(self.translate_headers(env)) if method in ('POST', 'PUT'): - data = input_ + len_ = env.get('CONTENT_LENGTH') + if len_: + data = LimitReader(input_, int(len_)) + else: + data = input_ response = requests.request(method=method, url=url, diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py new file mode 100644 index 00000000..620248cd --- /dev/null +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -0,0 +1,25 @@ +r""" +# No rewriting +>>> rewrite_cookie('a=b; c=d;') +[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] + +>>> rewrite_cookie('some=value; Domain=foo.com; Path=/;') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/')] + +>>> rewrite_cookie('some=value; Domain=foo.com; Path=/diff/path/;') +[('Set-Cookie', 'some=value; Path=/pywb/20131226101010/http://example.com/diff/path/')] + +>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT') +[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] + +""" + + +from pywb.rewrite.cookie_rewriter import WbUrlCookieRewriter +from pywb.rewrite.url_rewriter import UrlRewriter + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') + +def rewrite_cookie(cookie_str): + return WbUrlCookieRewriter(urlrewriter).rewrite(cookie_str) + diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py new file mode 100644 index 00000000..de772244 --- /dev/null +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -0,0 +1,80 @@ +""" +#================================================================= +HTTP Headers Rewriting +#================================================================= + +# Text with charset +>>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) +{'charset': 'utf-8', + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), + ('X-Archive-Orig-Content-Length', '5'), + ('Content-Type', 'text/html;charset=UTF-8')]), + 'text_type': 'html'} + +# Redirect +>>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') +{'charset': None, + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Location', '/web/20131010/http://example.com/other.html')]), + 'text_type': None} + +# cookie, host/origin rewriting +>>> _test_headers([('Connection', 'close'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com, abc=def; Path=somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')]) +{'charset': None, + 'removed_header_dict': {}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Connection', 'close'), + ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), + ( 'Set-Cookie', + 'abc=def; Path=/web/20131010/http://example.com/somefile.html'), + ('X-Archive-Orig-Host', 'example.com'), + ('X-Archive-Orig-Origin', 'https://example.com')]), + 'text_type': None} + + + +# gzip +>>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'charset': None, + 'removed_header_dict': {'content-encoding': 'gzip', + 'transfer-encoding': 'chunked'}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), + ('Content-Type', 'text/javascript')]), + 'text_type': 'js'} + +# Binary -- transfer-encoding removed +>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/; Domain=.example.com'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +{'charset': None, + 'removed_header_dict': {'transfer-encoding': 'chunked'}, + 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), + ('Content-Type', 'image/png'), + ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), + ('Content-Encoding', 'gzip')]), + 'text_type': None} + +""" + + + +from pywb.rewrite.header_rewriter import HeaderRewriter +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.utils.statusandheaders import StatusAndHeaders + +import pprint + +urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') + + +headerrewriter = HeaderRewriter() + +def _test_headers(headers, status = '200 OK'): + rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) + return pprint.pprint(vars(rewritten)) + + +if __name__ == "__main__": + import doctest + doctest.testmod() + + diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 17bf0a75..cbd2cb21 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -116,61 +116,13 @@ r""" >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") '@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)' -#================================================================= -HTTP Headers Rewriting -#================================================================= - -# Text with charset ->>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) -{'charset': 'utf-8', - 'removed_header_dict': {}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), - ('X-Archive-Orig-Content-Length', '5'), - ('Content-Type', 'text/html;charset=UTF-8')]), - 'text_type': 'html'} - -# Redirect ->>> _test_headers([('Connection', 'close'), ('Location', '/other.html')], '302 Redirect') -{'charset': None, - 'removed_header_dict': {}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'), - ('Location', '/web/20131010/http://example.com/other.html')]), - 'text_type': None} - -# gzip ->>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) -{'charset': None, - 'removed_header_dict': {'content-encoding': 'gzip', - 'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript')]), - 'text_type': 'js'} - -# Binary ->>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Cookie', 'blah'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) -{'charset': None, - 'removed_header_dict': {'transfer-encoding': 'chunked'}, - 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), - ('Content-Type', 'image/png'), - ('X-Archive-Orig-Cookie', 'blah'), - ('Content-Encoding', 'gzip')]), - 'text_type': None} - -Removing Transfer-Encoding always, Was: - ('Content-Encoding', 'gzip'), - ('Transfer-Encoding', 'chunked')]), 'charset': None, 'text_type': None, 'removed_header_dict': {}} - - """ + #================================================================= from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter -from pywb.rewrite.header_rewriter import HeaderRewriter -from pywb.utils.statusandheaders import StatusAndHeaders - -import pprint urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') @@ -184,12 +136,6 @@ def _test_xml(string): def _test_css(string): return CSSRewriter(urlrewriter).rewrite(string) -headerrewriter = HeaderRewriter() - -def _test_headers(headers, status = '200 OK'): - rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter) - return pprint.pprint(vars(rewritten)) - if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 1e8fa25e..938c9ee1 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -3,6 +3,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter from pywb import get_test_dir +from io import BytesIO + # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break @@ -81,5 +83,17 @@ def test_example_domain_specific_3(): assert '/* Bootloader.configurePage' in buff +def test_post(): + buff = BytesIO('ABCDEF') + + env = {'REQUEST_METHOD': 'POST', + 'HTTP_ORIGIN': 'http://example.com', + 'HTTP_HOST': 'example.com', + 'wsgi.input': buff} + + status_headers, resp_buff = get_rewritten('http://example.com/', urlrewriter, env=env) + assert status_headers.get_statuscode() == '200', status_headers + + def get_rewritten(*args, **kwargs): return LiveRewriter().get_rewritten(*args, **kwargs) diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index df4f32eb..843e665e 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -2,6 +2,7 @@ import copy import urlparse from wburl import WbUrl +from cookie_rewriter import WbUrlCookieRewriter #================================================================= @@ -82,6 +83,9 @@ class UrlRewriter(object): new_wburl.url = new_url return UrlRewriter(new_wburl, self.prefix) + def get_cookie_rewriter(self): + return WbUrlCookieRewriter(self) + def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) @@ -114,3 +118,6 @@ class HttpsUrlRewriter(object): def rebase_rewriter(self, new_url): return self + + def get_cookie_rewriter(self): + return None diff --git a/setup.py b/setup.py index 91279b4f..c33471fd 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.3.1', + version='0.4.0', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com',