1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Rewriting fix for DASH FB and document.write (#529)

* rewrite fixes:
- DASH rewrite fix for FB: when rewriting, match the quoted key '"dash_prefetched_representation_ids"' as well as the unquoted form;
update tests to ensure that both the old and new formats are rewritten
- wombat update to fix #527: ensure document.write() doesn't accidentally remove the end tag when the end tag is not lowercase (see webrecorder/wombat#21)

* tests: fix recorder cookie filtering test, use https://www.google.com/ for testing

* appveyor: fix appveyor builds
This commit is contained in:
Ilya Kreymer 2020-01-11 10:44:49 -08:00 committed by GitHub
parent 523e35d973
commit f0b9d5b8e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 43 additions and 13 deletions

View File

@ -23,6 +23,7 @@ install:
- "pip install pypiwin32"
- "pip install certauth boto3 youtube-dl pysocks"
- "pip install codecov"
- "pip install wheel"
build_script:
- "python setup.py install"

View File

@ -71,8 +71,8 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes
return dedup_index
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
url = 'http://' + host + path
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url='', protocol='http'):
url = protocol + '://' + host + path
req_url = '/live/resource/postreq?url=' + url + other_params
testapp = webtest.TestApp(recorder_app)
resp = testapp.post(req_url, general_req_data.format(host=host, path=path).encode('utf-8'))
@ -231,8 +231,9 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes
PerRecordWARCWriter(warc_path, header_filter=header_filter),
accept_colls='live')
resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
assert b'HTTP/1.1 302' in resp.body
resp = self._test_warc_write(recorder_app, 'www.google.com', '/', protocol='https')
print(resp.body.decode('utf-8'))
#assert b'HTTP/1.1 302' in resp.body
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff)

View File

@ -59,21 +59,30 @@ class RewriteDASH(BufferedRewriter):
# ============================================================================
def rewrite_fb_dash(string, *args):
    """Rewrite a DASH manifest embedded as an escaped string inside JS.

    ``string`` is expected to contain the escaped manifest text followed by
    a ``dash_prefetched_representation_ids`` marker, with the key either
    unquoted (older format) or quoted (newer format).  The text before the
    marker is unescaped, passed through ``RewriteDASH().rewrite_dash()``,
    re-escaped as the body of a JS string literal, and then the matched
    marker plus the JSON-encoded list of selected representation ids is
    appended.

    :param string: the matched text to rewrite
    :param args: extra positional arguments; accepted and ignored
    :return: the rewritten text, or ``string`` unchanged when neither
             marker is present
    """
    # Raw strings: the marker starts with a literal backslash + 'n' (the
    # escaped newline that ends the manifest), not a real newline.
    DASH_SPLITS = (
        r'\n",dash_prefetched_representation_ids:',
        r'\n","dash_prefetched_representation_ids":',
    )

    split = None
    inx = -1
    for split in DASH_SPLITS:
        inx = string.find(split)
        if inx >= 0:
            break

    if inx < 0:
        # Always hand back a str: every other exit of this function returns
        # text, so the no-match path returns the input untouched.
        return string

    # Undo the JS string escaping.  Encoding as latin-1 with backslashreplace
    # keeps non-ASCII characters intact through the 'unicode-escape' decode
    # (a utf-8 encode here would turn them into mojibake).
    buff = string[:inx].encode('latin-1', 'backslashreplace').decode('unicode-escape')
    # JSON-style escaped slashes are not handled by 'unicode-escape'.
    buff = buff.replace('\\/', '/')

    stream = BytesIO(buff.encode('utf-8'))
    stream, best_ids = RewriteDASH().rewrite_dash(stream, None)

    # json.dumps() re-escapes the manifest; strip the surrounding quotes since
    # the caller's text already supplies them, and hide '<' from HTML parsers.
    string = json.dumps(stream.read().decode('utf-8'))
    string = string[1:-1].replace('<', r'\x3C')

    # Re-attach whichever marker form was found, followed by the new id list.
    string += split
    string += json.dumps(best_ids)
    return string

View File

@ -718,6 +718,25 @@ http://example.com/video_4.m3u8
assert 'dash_prefetched_representation_ids:["1", "7"]' in result
assert rep_ids not in result
def test_dash_fb_in_js_2(self):
    """Check DASH rewriting in JS when the representation-ids key is quoted.

    Builds a JS snippet containing the escaped sample manifest followed by
    the quoted ``"dash_prefetched_representation_ids"`` marker, runs it
    through ``rewrite_record`` and checks the id list was replaced.
    """
    headers = {'Content-Type': 'text/javascript'}

    mpd_path = os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd')
    with open(mpd_path, 'rt') as fh:
        escaped_manifest = fh.read().encode('unicode-escape').decode('utf-8')

    rep_ids = r'\n","dash_prefetched_representation_ids":["4","5"]'
    content = 'dash_manifest:"' + escaped_manifest + rep_ids

    headers, gen, is_rw = self.rewrite_record(
        headers, content, ts='201701js_',
        url='http://facebook.com/example/dash/manifest.mpd')

    assert headers.headers == [('Content-Type', 'text/javascript')]

    result = b''.join(gen).decode('utf-8')

    # 4, 5 representations removed, replaced with default 1, 7
    assert '"dash_prefetched_representation_ids":["1", "7"]' in result
    assert rep_ids not in result
def test_dash_custom_max_resolution(self):
headers = {'Content-Type': 'application/dash+xml'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_dash.mpd'), 'rt') as fh:

View File

@ -166,7 +166,7 @@ rules:
- match: 'Bootloader\.configurePage.*?;'
replace: '/* {0} */'
- match: 'dash_manifest:"(.*",dash_prefetched_representation_ids:.*?])'
- match: 'dash_manifest"?:"(.*","?dash_prefetched_representation_ids"?:.*?])'
group: 1
function: 'pywb.rewrite.rewrite_dash:rewrite_fb_dash'

File diff suppressed because one or more lines are too long

View File

@ -93,7 +93,7 @@ setup(
long_description=get_ldecription(),
license='GPL',
packages=find_packages(exclude=['tests_disabled']),
zip_safe=True,
zip_safe=False,
package_data={
'pywb': get_package_data(),
},

2
wombat

@ -1 +1 @@
Subproject commit c3276154de61196c0c34d9f5f1242706d6e407b6
Subproject commit b8a75357e82ef91b006be177cc3e5d827e02ff7d