1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Rewriting fixes for http-only cookies, bad content-length, and document with base (#386)

* rewriting fixes:
server side: cookie rewriting: if httponly cookie with mp_/if_ modifier and path ends with '/', add set-cookie for all known modifiers
content length parsing: improve content-length parsing to support 'content-length: num,num', parse out the first number (occasionally seen with range requests when range is dropped for upstream)
wombat: rewrite_elem: use element.ownerDocument for resolving baseUri for parent paths
tests: add tests for cookie all modifier rewrite, bad content-length parsing (skip for py2.7)
This commit is contained in:
Ilya Kreymer 2018-10-05 14:37:32 -07:00 committed by GitHub
parent e6f00ce58d
commit 671dd2c204
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 94 additions and 7 deletions

View File

@ -175,6 +175,12 @@ class RewriterApp(object):
content_length = (record.http_headers.
get_header('Content-Length'))
if content_length is None:
return
content_length = content_length.split(',')[0]
try:
content_length = int(content_length)
if not range_end:

View File

@ -26,10 +26,39 @@ class WbUrlBaseCookieRewriter(object):
morsel = self.rewrite_cookie(name, morsel)
self._filter_morsel(morsel)
results.append((header, morsel.OutputString()))
if not self.add_prefix_cookie_for_all_mods(morsel, results, header):
value = morsel.OutputString()
results.append((header, value))
return results
def add_prefix_cookie_for_all_mods(self, morsel, results, header):
    """Emit an HttpOnly prefix cookie once for every common modifier.

    If the cookie is HttpOnly, its path ends in '/', and the current
    modifier is mp_ or if_, then assume it's meant to be a prefix and is
    likely needed for other content. In that case one copy of the cookie
    is appended to ``results`` for each common modifier
    (mp_, cs_, js_, im_, oe_, if_), with the current modifier in the
    path swapped for that modifier.

    :param morsel: cookie morsel to expand; its 'path' is overwritten
        while the copies are generated and is left set to the last one
    :param results: list extended in place with (header, cookie string)
        tuples
    :param header: header name stored alongside each cookie string
    :return: True if the per-modifier cookies were appended, False if the
        cookie does not qualify (nothing is appended in that case)
    """
    curr_mod = self.url_rewriter.wburl.mod
    if curr_mod not in ('mp_', 'if_'):
        return False

    if not morsel.get('httponly'):
        return False

    path = morsel.get('path')
    if not path or not path.endswith('/'):
        return False

    curr_segment = curr_mod + '/'
    for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'):
        # Swap only the first occurrence. Presumably the rewritten path has
        # the form <prefix><timestamp><mod>/<original path> (verify against
        # rewrite_cookie), so a later 'mp_/' or 'if_/' belongs to the
        # original cookie path and must be left untouched.
        morsel['path'] = path.replace(curr_segment, mod + '/', 1)
        results.append((header, morsel.OutputString()))

    return True
def _filter_morsel(self, morsel):
path = morsel.get('path')
if path:

View File

@ -21,6 +21,7 @@ import os
import json
import pytest
import six
import re
# ============================================================================
@ -277,6 +278,36 @@ class TestContentRewriter(object):
assert is_rw == False
def test_rewrite_cookies_all_mods(self):
    """An HttpOnly cookie with a trailing-slash path, rewritten under mp_,
    must come back as six Set-Cookie headers, one per modifier."""
    cookie = 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/some/path/; HttpOnly'
    payload = '\x11\x12\x13\x14'

    rewritten, _gen, is_rw = self.rewrite_record({'Set-Cookie': cookie}, payload, ts='201701mp_')

    assert len(rewritten.headers) == 6

    # Collect the modifier that follows the timestamp in each rewritten path.
    seen_mods = set()
    for header_name, header_value in rewritten.headers:
        assert header_name == 'Set-Cookie'
        found = re.search('Path=/prefix/201701([^/]+)', header_value)
        seen_mods.add(found.group(1))

    assert seen_mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'}
    assert is_rw == False
def test_rewrite_http_cookie_no_all_mods_no_slash(self):
    """An HttpOnly cookie whose path lacks a trailing slash stays a single
    Set-Cookie header, even under the mp_ modifier."""
    cookie = 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/some/path; HttpOnly'
    payload = 'abcdefg'

    rewritten, _gen, _is_rw = self.rewrite_record({'Set-Cookie': cookie}, payload, ts='201701mp_')

    assert len(rewritten.headers) == 1
    first_name = rewritten.headers[0][0]
    assert first_name == 'Set-Cookie'
def test_rewrite_http_cookie_no_all_mods_wrong_mod(self):
    """An HttpOnly cookie with a trailing-slash path stays a single
    Set-Cookie header when the request modifier is id_."""
    cookie = 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/some/path/; HttpOnly'
    payload = 'abcdefg'

    rewritten, _gen, _is_rw = self.rewrite_record({'Set-Cookie': cookie}, payload, ts='201701id_')

    assert len(rewritten.headers) == 1
    first_name = rewritten.headers[0][0]
    assert first_name == 'Set-Cookie'
def test_binary_no_content_type(self):
headers = {}
content = '\x11\x12\x13\x14'

View File

@ -251,8 +251,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
//============================================
var rewrite_url = rewrite_url_;
function rewrite_url_debug(url, use_rel, mod) {
var rewritten = rewrite_url_(url, use_rel, mod);
function rewrite_url_debug(url, use_rel, mod, doc) {
var rewritten = rewrite_url_(url, use_rel, mod, doc);
if (url != rewritten) {
console.log('REWRITE: ' + url + ' -> ' + rewritten);
} else {
@ -280,7 +280,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
//============================================
function rewrite_url_(url, use_rel, mod) {
function rewrite_url_(url, use_rel, mod, doc) {
// If undefined, just return it
if (!url) {
return url;
@ -369,7 +369,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
// Use a parser
if (url.charAt(0) == ".") {
url = resolve_rel_url(url);
url = resolve_rel_url(url, doc);
}
// If full url starting with http://, https:// or //
@ -1606,7 +1606,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
return;
}
var mod = rwModForElement(elem, name);
new_value = rewrite_url(value, false, mod);
new_value = rewrite_url(value, false, mod, elem.ownerDocument);
}
if (new_value != value) {

View File

@ -108,7 +108,7 @@ class BaseLoader(object):
# Try to set content-length, if it is available and valid
try:
content_len = int(content_len_str)
except (KeyError, TypeError):
except (ValueError, TypeError):
content_len = -1
if content_len >= 0:

View File

@ -1,6 +1,7 @@
from .base_config_test import BaseConfigTest, fmod_sl
from pywb.warcserver.test.testutils import HttpBinLiveTests
import pytest
import sys
# ============================================================================
@ -37,6 +38,26 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
resp = self.head('/live/{0}httpbin.org/get?foo=bar', fmod_sl)
assert resp.status_int == 200
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
def test_live_bad_content_length(self, fmod_sl):
    """Malformed upstream Content-Length values still produce a 200 with a
    single numeric Content-Length header."""
    # Duplicated value 'num,num': the response must report '149'.
    duplicated = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl, status=200)
    assert duplicated.headers['Content-Length'] == '149'

    # Non-numeric value: the response must report '90'.
    non_numeric = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl, status=200)
    assert non_numeric.headers['Content-Length'] == '90'
@pytest.mark.skipif(sys.version_info < (3,0), reason='does not respond in 2.7')
def test_live_bad_content_length_with_range(self, fmod_sl):
    """Malformed upstream Content-Length values combined with a Range
    request still produce a 206 with matching Content-Length and
    Content-Range headers."""
    # Duplicated value 'num,num' with an open-ended range.
    duplicated = self.get('/live/{0}httpbin.org/response-headers?content-length=149,149', fmod_sl,
                          headers={'Range': 'bytes=0-'}, status=206)
    assert duplicated.headers['Content-Length'] == '149'
    assert duplicated.headers['Content-Range'] == 'bytes 0-148/149'

    # Non-numeric value with an open-ended range.
    non_numeric = self.get('/live/{0}httpbin.org/response-headers?Content-Length=xyz', fmod_sl,
                           headers={'Range': 'bytes=0-'}, status=206)
    assert non_numeric.headers['Content-Length'] == '90'
    assert non_numeric.headers['Content-Range'] == 'bytes 0-89/90'
def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200