1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

live rewriting/utf-8 headers: fix for sites that have utf-8 in headers despite standard (#402)

- attempt to encode headers as utf-8 first for live web, then latin-1 (similar to warcio http header parsing)
- only encode headers for py3 (in py2, headers are already bytestrings)
- tests: add tests for utf-8 in header
bump version to 2.1.1
This commit is contained in:
Ilya Kreymer 2018-10-26 15:06:59 -07:00 committed by GitHub
parent 1b151b74bf
commit e1e8917bc3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 56 additions and 4 deletions

View File

@ -1,4 +1,4 @@
# Package version string. Only the effective value is kept: the earlier
# '2.1.0' assignment was immediately overwritten and therefore dead.
__version__ = '2.1.1'

# Default configuration file path.
DEFAULT_CONFIG = 'pywb/default_config.yaml'

View File

@ -353,6 +353,17 @@ class LiveWebLoader(BaseLoader):
v = self.unrewrite_header(cdx, v)
http_headers_buff += n + ': ' + v + '\r\n'
http_headers_buff += '\r\n'
try:
# http headers could be encoded as utf-8 (though non-standard)
# first try utf-8 encoding
http_headers_buff = http_headers_buff.encode('utf-8')
except:
# then, fall back to latin-1
http_headers_buff = http_headers_buff.encode('latin-1')
except: #pragma: no cover
#PY 2
resp_headers = orig_resp.msg.headers
@ -374,8 +385,8 @@ class LiveWebLoader(BaseLoader):
else:
http_headers_buff += line
http_headers_buff += '\r\n'
http_headers_buff = http_headers_buff.encode('latin-1')
# if python2, already byte headers, so leave as is
http_headers_buff += '\r\n'
try:
fp = upstream_res._fp.fp

View File

@ -22,11 +22,13 @@ def fmod_sl(request):
# ============================================================================
class BaseConfigTest(BaseTestClass):
lint_app = True
@classmethod
def get_test_app(cls, config_file, custom_config=None):
    """Create the application under test and its ``webtest`` wrapper.

    :param config_file: config file name, resolved relative to the
        directory that contains this module
    :param custom_config: passed through unchanged to ``FrontEndApp``
    :return: ``(app, testapp)`` where ``testapp`` is a ``webtest.TestApp``
        wrapping ``app``
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))
    config_file = os.path.join(test_dir, config_file)

    app = FrontEndApp(config_file=config_file, custom_config=custom_config)

    # Honor the class-level ``lint_app`` switch so subclasses can turn the
    # webtest lint checks off. An unconditional earlier return previously
    # made this line unreachable, so the switch had no effect.
    return app, webtest.TestApp(app, lint=cls.lint_app)
@classmethod
def setup_class(cls, config_file, include_non_frame=True, custom_config=None):

View File

@ -1,14 +1,44 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .base_config_test import BaseConfigTest, fmod_sl
from pywb.warcserver.test.testutils import HttpBinLiveTests
from pywb.utils.geventserver import GeventServer
import pytest
import sys
import six
# ============================================================================
def header_test_server(environ, start_response):
    """Tiny WSGI app used as an upstream server for header tests.

    Always answers ``200 OK`` with the body ``b'body'``. For the path
    ``/unicode`` it additionally sends a ``Content-Length`` header and an
    ``x-utf-8`` header whose value is utf-8 encoded text.

    :param environ: WSGI environ dict; only ``PATH_INFO`` is read
    :param start_response: WSGI ``start_response`` callable
    :return: single-element list containing the response body bytes
    """
    body = b'body'
    headers = []

    # Only build the custom header value for the one path that sends it.
    if environ['PATH_INFO'] == '/unicode':
        # NOTE(review): the literal is empty in this view -- presumably a
        # non-ASCII character was intended here; confirm against upstream.
        value = u''
        value = value.encode('utf-8')
        if six.PY3:
            # On py3 header values must be native strings, so carry the
            # utf-8 bytes through as latin-1 code points.
            value = value.decode('latin-1')

        headers = [('Content-Length', str(len(body))),
                   ('x-utf-8', value)]

    # PEP 3333 requires start_response to be invoked positionally; servers
    # are free to name the second parameter anything they like.
    start_response('200 OK', headers)
    return [body]
# ============================================================================
class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
@classmethod
def setup_class(cls):
    """Prepare the live-rewriter test class.

    Sets ``lint_app`` to ``False`` before delegating to the parent
    ``setup_class`` with ``config_test.yaml``, then creates a
    ``GeventServer`` around ``header_test_server`` and stores it on the
    class as ``test_serv``.
    """
    # Must be set before the parent setup runs.
    # NOTE(review): presumably the parent setup builds the test app via
    # get_test_app(), which reads this flag -- confirm.
    cls.lint_app = False

    parent = super(TestLiveRewriter, cls)
    parent.setup_class('config_test.yaml')

    cls.test_serv = GeventServer(header_test_server)
@classmethod
def teardown_class(cls):
    """Stop the helper server, then run the parent class teardown.

    The parent teardown runs even if stopping the server raises, so an
    error there cannot skip the inherited cleanup.
    """
    try:
        cls.test_serv.stop()
    finally:
        super(TestLiveRewriter, cls).teardown_class()
def test_live_live_1(self, fmod_sl):
headers = [('User-Agent', 'python'), ('Referer', 'http://localhost:80/live/other.example.com')]
@ -58,6 +88,15 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
assert resp.headers['Content-Length'] == '90'
assert resp.headers['Content-Range'] == 'bytes 0-89/90'
def test_custom_unicode_header(self, fmod_sl):
    """Check that the ``x-utf-8`` header from the helper server is
    returned through ``/live/`` with the expected value.

    The expected value is built the same way ``header_test_server``
    builds it: utf-8 encoded, then on py3 re-read as latin-1.
    """
    encoded = u''.encode('utf-8')
    expected = encoded.decode('latin-1') if six.PY3 else encoded

    # ``%s`` is filled with the helper server port here; the ``{0}``
    # placeholder is left in the string passed to ``self.get``.
    url = '/live/{0}http://localhost:%s/unicode' % self.test_serv.port
    resp = self.get(url, fmod_sl)

    assert resp.headers['x-utf-8'] == expected
def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200