1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00

RC7 Fixes (#561)

* misc fixes for 2.4.0rc7:
- warcserver: when parsing headers to check for redirect, reserialized headers
may be of a different length than the original, causing the warcserver->app response to hang.
Now adjusting the content-length on the warc record and also not including a fixed
length when serving warcserver->app, possible fix for ukwa/ukwa-pywb#53
- undo change in path resolvers to use os.path.join, just concatenate full_path + filename
- rewrite 'date' -> 'x-archive-orig-date' header to avoid confusion (eg. #548)
- bump version to rc7

* ci: attempt to fix travis build for 27, 35
This commit is contained in:
Ilya Kreymer 2020-04-30 22:39:47 -07:00 committed by GitHub
parent 871a05a76a
commit 7e56ca8ca2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 56 additions and 17 deletions

View File

@ -2,6 +2,7 @@
set -e
pip install --upgrade pip setuptools
pip install 'Markupsafe<2.0.0'
python setup.py -q install
pip install -r extra_requirements.txt
pip install coverage pytest-cov coveralls

View File

@ -152,19 +152,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 212
Total: 213
# test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 212
Total: 213
# test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 407
Total: 408
# test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@ -188,7 +188,7 @@ Total: 4
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 212
Total: 213
# test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')

View File

@ -38,7 +38,7 @@ class DefaultHeaderRewriter(object):
'content-security-policy-report-only': 'prefix',
'content-type': 'keep',
'date': 'keep',
'date': 'prefix',
'etag': 'prefix',
'expires': 'prefix',

View File

@ -42,7 +42,7 @@ class TestHeaderRewriter(object):
res = """\
HTTP/1.0 200 OK\r\n\
Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
X-Archive-Orig-Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
X-Archive-Orig-Content-Length: 5\r\n\
Content-Type: text/html;charset=UTF-8\r\n\
"""

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,4 @@
__version__ = '2.4.0-rc6'
__version__ = '2.4.0-rc7'
if __name__ == '__main__':
print(__version__)

View File

@ -37,7 +37,8 @@ class PrefixResolver(object):
if hasattr(cdx, '_formatter') and cdx._formatter:
full_path = cdx._formatter.format(full_path)
path = os.path.join(full_path, filename)
#path = os.path.join(full_path, filename)
path = full_path + filename
if '*' not in path:
return path

View File

@ -60,7 +60,9 @@ class BaseLoader(object):
out_headers['Link'] = other_headers.get('Link')
out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
if not compress:
out_headers['Content-Length'] = other_headers.get('Content-Length')
known_length = other_headers.get('Content-Length')
if known_length:
out_headers['Content-Length'] = known_length
return out_headers, StreamIter(stream, closer=call_release_conn)
@ -75,12 +77,13 @@ class BaseLoader(object):
warc_headers_buff = warc_headers.to_bytes()
if not compress:
lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
out_headers,
len(warc_headers_buff))
else:
lenset = False
# don't set length, just stream as is in case it is wrong
#if not compress:
# lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
# out_headers,
# len(warc_headers_buff))
#else:
# lenset = False
streamiter = StreamIter(stream,
header1=warc_headers_buff,
@ -210,6 +213,10 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
# go through self-redirect check just in case
if not status or not status.startswith(('2', '4', '5')):
http_headers = self.headers_parser.parse(payload.raw_stream)
try:
orig_size = payload.raw_stream.tell()
except:
orig_size = 0
try:
self.raise_on_self_redirect(params, cdx,
@ -222,6 +229,14 @@ class WARCPathLoader(DefaultResolverMixin, BaseLoader):
http_headers_buff = http_headers.to_bytes()
# if new http_headers_buff is different length,
# attempt to adjust content-length on the WARC record
if orig_size and len(http_headers_buff) != orig_size:
orig_cl = payload.rec_headers.get_header('Content-Length')
if orig_cl:
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
payload.rec_headers.replace_header('Content-Length', str(new_cl))
warc_headers = payload.rec_headers
if headers != payload:

View File

@ -0,0 +1 @@
org,iana)/bads 20140127171238 {"offset":"0","mime":"unk","url":"http://iana.org/bads","digest":"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","length":"485","status":"302","filename":"missing-status-text.warc"}

View File

@ -0,0 +1,16 @@
WARC/1.0
WARC-Type: response
WARC-Record-ID: <urn:uuid:125f3091-65f6-4d0e-96dd-dbff7c551275>
WARC-Date: 2014-01-27T17:12:38Z
Content-Length: 107
Content-Type: application/http; msgtype=response
WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
WARC-Target-URI: http://iana.org/bads
WARC-Warcinfo-ID: <urn:uuid:f087c2a7-14b3-43b7-9c47-79f29f4b39e7>
HTTP/1.0 302
Location: http://www.iana.org/badst
Server: BigIP
Connection: close
Content-Length: 0

View File

@ -288,6 +288,11 @@ class TestWbIntegration(BaseConfigTest):
assert '"20140127171251"' in resp.text
assert '/pywb/{0}http://www.iana.org/domains/example'.format(fmod_slash) in resp.text, resp.text
def test_replay_content_bad_status_text(self, fmod):
# test replay of a record whose HTTP status line has no status text: Content-Length must match the body
resp = self.get('/pywb/20140127171238{0}/https://iana.org/bads', fmod)
assert resp.headers['Content-Length'] == str(len(resp.text))
def test_replay_non_latest_content_location_ts(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
resp = self.get('/pywb/{0}http://example.com/', fmod_slash)

2
wombat

@ -1 +1 @@
Subproject commit 1dc98bc1f3b90054536d767102b64d71e3da3ad1
Subproject commit 1aba3b2f3393ad46d15dbed50c9b6ed29185e2d7