1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Brotli: Don't accept brotli if library can't be loaded. (#444)

* brotli: if the brotli module cannot be loaded, print a warning
and also remove `br` from any Accept-Encoding header to avoid recording with brotli (addresses #434)
This commit is contained in:
Ilya Kreymer 2019-02-19 17:19:24 -08:00 committed by GitHub
parent 000ed89dc3
commit 32c1e6c85b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 3 deletions

View File

@ -6,6 +6,14 @@ from six.moves.urllib.parse import urlsplit
import re
# Probe for the brotli module at import time and record the outcome in
# ``has_brotli`` so the rest of this module can check the flag.
try:  # pragma: no cover
    import brotli
except Exception:  # pragma: no cover
    # Any Exception raised while importing (not only ImportError) is treated
    # as "brotli unavailable".
    has_brotli = False
    print('Warning: brotli module could not be loaded, will not be able to replay brotli-encoded content')
else:  # pragma: no cover
    has_brotli = True
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
@ -79,6 +87,12 @@ class RewriteInputRequest(DirectWSGIInputRequest):
if self.splits:
value = self.splits.scheme
elif not has_brotli and name == 'HTTP_ACCEPT_ENCODING' and 'br' in value:
# if brotli is not available, remove 'br' from accept-encoding to avoid
# capturing brotli-encoded content
name = 'Accept-Encoding'
value = ','.join([enc for enc in value.split(',') if enc.strip() != 'br'])
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')

View File

@ -3,10 +3,15 @@ from pywb.manager.manager import main as manager
from pywb.manager.autoindex import AutoIndexer
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH
from warcio import ArchiveIterator
import os
import time
import json
from mock import patch
import pytest
# ============================================================================
class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
@ -153,6 +158,20 @@ class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
assert names[0].startswith('pywb-rec-test-')
assert names[0].endswith('.warcgz')
TestRecordCustomConfig.warc_name = os.path.join(dir_name, names[0])
@patch('pywb.rewrite.rewriteinputreq.has_brotli', False)
def test_no_brotli(self):
    """Record a page with ``has_brotli`` patched to False and check the WARC.

    The request is sent with ``Accept-Encoding: gzip, deflate, br``; the
    last record found in ``self.warc_name`` is expected to carry
    ``Accept-Encoding: gzip, deflate``, i.e. with ``br`` removed.
    """
    res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D',
                           headers={'Accept-Encoding': 'gzip, deflate, br'})
    assert '"C": "D"' in res.text

    # Walk the whole archive and keep only the final record. Starting from
    # None makes an empty archive fail with a clear assertion instead of a
    # NameError on the loop variable.
    last_record = None
    with open(self.warc_name, 'rb') as fh:
        for record in ArchiveIterator(fh):
            last_record = record

    assert last_record is not None, 'no records found in recorded WARC'
    # NOTE(review): presumably the last record is the request record for the
    # fetch above -- confirm against the recorder's write order.
    assert last_record.http_headers['Accept-Encoding'] == 'gzip, deflate'
# ============================================================================
class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
@ -174,17 +193,17 @@ class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
}
super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom)
manager(['init', 'test-new'])
def test_skip_existing(self):
    """Fetch a URL through the record endpoint and check nothing is archived.

    The fallback CDX endpoint must return a non-empty result for the URL,
    the record endpoint must serve the page, and the ``test-new`` archive
    directory must still be empty afterwards.
    """
    archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
    assert os.path.isdir(archive_dir)

    # The URL must already be known to the fallback index.
    cdx_res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1')
    assert cdx_res.text != ''

    page_res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1')
    assert 'Example Domain' in page_res.text

    # No WARC file should have been created in the collection archive.
    archived_files = os.listdir(archive_dir)
    assert archived_files == []
def test_record_new(self):
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
assert os.path.isdir(dir_name)