mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
R6 - Various Fixes (#540)
* fixes for RC6:
  - blockrecordloader: ensure record stream is closed after parsing one record
  - wrap HttpLoader streams in StreamClosingReader() which should close the connection even if stream not fully consumed
  - simplify no_except_close
    may help with ukwa/ukwa-pywb#53
  - iframe: add allow fullscreen, autoplay
  - wombat: update to latest, filter out custom wombat props from getOwnPropertyNames
  - rules: add rule for vimeo
* cdx formatting: fix output=text to return plain text / non-cdxj output
* auto fetch fix:
  - update to latest wombat to fix auto-fetch in rewriting mode
  - fix /proxy-fetch/ endpoint for proxy mode recording, switch proxy-fetch to run in recording mode
  - don't use global to allow repeated checks
* rewriter html check: peek 1024 bytes to determine if page is html instead of 128
* fix jinja2 dependency for py2
This commit is contained in:
parent
fa021eebab
commit
92e459bda5
@ -82,6 +82,7 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
|
self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
|
||||||
self.proxy_coll = None # the name of the collection that has proxy mode enabled
|
self.proxy_coll = None # the name of the collection that has proxy mode enabled
|
||||||
|
self.proxy_record = False # indicate if proxy recording
|
||||||
self.init_proxy(config)
|
self.init_proxy(config)
|
||||||
|
|
||||||
self.init_recorder(config.get('recorder'))
|
self.init_recorder(config.get('recorder'))
|
||||||
@ -627,17 +628,21 @@ class FrontEndApp(object):
|
|||||||
if proxy_coll in self.warcserver.list_fixed_routes():
|
if proxy_coll in self.warcserver.list_fixed_routes():
|
||||||
raise Exception('Can not record into fixed collection')
|
raise Exception('Can not record into fixed collection')
|
||||||
|
|
||||||
proxy_coll += self.RECORD_ROUTE
|
proxy_route = proxy_coll + self.RECORD_ROUTE
|
||||||
if not config.get('recorder'):
|
if not config.get('recorder'):
|
||||||
config['recorder'] = 'live'
|
config['recorder'] = 'live'
|
||||||
|
|
||||||
|
self.proxy_record = True
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
|
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
|
||||||
|
self.proxy_record = False
|
||||||
|
proxy_route = proxy_coll
|
||||||
|
|
||||||
if proxy_config.get('enable_content_rewrite', True):
|
if proxy_config.get('enable_content_rewrite', True):
|
||||||
self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
|
self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
|
||||||
else:
|
else:
|
||||||
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
|
self.proxy_prefix = '/{0}/id_/'.format(proxy_route)
|
||||||
|
|
||||||
self.proxy_default_timestamp = proxy_config.get('default_timestamp')
|
self.proxy_default_timestamp = proxy_config.get('default_timestamp')
|
||||||
if self.proxy_default_timestamp:
|
if self.proxy_default_timestamp:
|
||||||
@ -686,14 +691,14 @@ class FrontEndApp(object):
|
|||||||
return WbResponse.options_response(env)
|
return WbResponse.options_response(env)
|
||||||
|
|
||||||
# ensure full URL
|
# ensure full URL
|
||||||
request_url = env['REQUEST_URI']
|
url = env['REQUEST_URI'].split('/proxy-fetch/', 1)[-1]
|
||||||
# replace with /id_ so we do not get rewritten
|
|
||||||
url = request_url.replace('/proxy-fetch', '/id_')
|
env['REQUEST_URI'] = self.proxy_prefix + url
|
||||||
# update WSGI environment object
|
env['PATH_INFO'] = self.proxy_prefix + env['PATH_INFO'].split('/proxy-fetch/', 1)[-1]
|
||||||
env['REQUEST_URI'] = self.proxy_coll + url
|
|
||||||
env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_')
|
|
||||||
# make request using normal serve_content
|
# make request using normal serve_content
|
||||||
response = self.serve_content(env, self.proxy_coll, url)
|
response = self.serve_content(env, self.proxy_coll, url, record=self.proxy_record)
|
||||||
|
|
||||||
# for WR
|
# for WR
|
||||||
if isinstance(response, WbResponse):
|
if isinstance(response, WbResponse):
|
||||||
response.add_access_control_headers(env=env)
|
response.add_access_control_headers(env=env)
|
||||||
|
@ -488,7 +488,7 @@ class RewriteInfo(object):
|
|||||||
else:
|
else:
|
||||||
return text_type
|
return text_type
|
||||||
|
|
||||||
buff = self.read_and_keep(128)
|
buff = self.read_and_keep(1024)
|
||||||
|
|
||||||
# check if doesn't start with a tag, then likely not html
|
# check if doesn't start with a tag, then likely not html
|
||||||
if self.TAG_REGEX.match(buff):
|
if self.TAG_REGEX.match(buff):
|
||||||
|
@ -113,7 +113,7 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ this.__WB_source = obj;
|
|||||||
# rewriting 'this.' special properties access, not on new line (no ;)
|
# rewriting 'this.' special properties access, not on new line (no ;)
|
||||||
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
|
||||||
# rewrite '= this' or ', this'
|
# rewrite '= this' or ', this'
|
||||||
(r'(?<=[=,])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
|
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
|
||||||
# rewrite ')(this)'
|
# rewrite ')(this)'
|
||||||
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
|
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
|
||||||
# rewrite this in && or || expr?
|
# rewrite this in && or || expr?
|
||||||
|
@ -197,6 +197,9 @@ r"""
|
|||||||
>>> _test_js_obj_proxy('return this.foo')
|
>>> _test_js_obj_proxy('return this.foo')
|
||||||
'return this.foo'
|
'return this.foo'
|
||||||
|
|
||||||
|
>>> _test_js_obj_proxy('{foo: bar, this: other}')
|
||||||
|
'{foo: bar, this: other}'
|
||||||
|
|
||||||
>>> _test_js_obj_proxy(r'this.$location = http://example.com/')
|
>>> _test_js_obj_proxy(r'this.$location = http://example.com/')
|
||||||
'this.$location = http://example.com/'
|
'this.$location = http://example.com/'
|
||||||
|
|
||||||
|
@ -344,7 +344,7 @@ rules:
|
|||||||
- videoFileId
|
- videoFileId
|
||||||
- signature
|
- signature
|
||||||
|
|
||||||
- url_prefix: 'net,akamaized,gcs-vimeo)/'
|
- url_prefix: ['net,akamaized,gcs-vimeo)/', 'net,akamaized,vod)/']
|
||||||
|
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: '([/\d]+\.mp4)$'
|
match: '([/\d]+\.mp4)$'
|
||||||
|
@ -23,7 +23,7 @@ var config = {
|
|||||||
rwRe: null,
|
rwRe: null,
|
||||||
defaultFetchOptions: {
|
defaultFetchOptions: {
|
||||||
cache: 'force-cache',
|
cache: 'force-cache',
|
||||||
mode: null
|
mode: 'cors'
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -53,7 +53,7 @@ if (!config.haveFetch) {
|
|||||||
xhr.onreadystatechange = function() {
|
xhr.onreadystatechange = function() {
|
||||||
if (xhr.readyState === 4) {
|
if (xhr.readyState === 4) {
|
||||||
if (!config.havePromise) {
|
if (!config.havePromise) {
|
||||||
fetchDoneOrErrored();
|
fetchDone();
|
||||||
}
|
}
|
||||||
resolve();
|
resolve();
|
||||||
}
|
}
|
||||||
@ -78,7 +78,7 @@ if (location.search.indexOf('init') !== -1) {
|
|||||||
config.prefix = init.prefix;
|
config.prefix = init.prefix;
|
||||||
config.mod = init.mod;
|
config.mod = init.mod;
|
||||||
config.prefixMod = init.prefix + init.mod;
|
config.prefixMod = init.prefix + init.mod;
|
||||||
config.rwRe = new RegExp(init.rwRe, 'g');
|
config.rwRe = new RegExp(init.rwRe);
|
||||||
config.relative = init.prefix.split(location.origin)[1];
|
config.relative = init.prefix.split(location.origin)[1];
|
||||||
config.schemeless = '/' + config.relative;
|
config.schemeless = '/' + config.relative;
|
||||||
})();
|
})();
|
||||||
@ -101,11 +101,16 @@ self.onmessage = function(event) {
|
|||||||
|
|
||||||
function noop() {}
|
function noop() {}
|
||||||
|
|
||||||
function fetchDoneOrErrored() {
|
function fetchDone() {
|
||||||
runningFetches -= 1;
|
runningFetches -= 1;
|
||||||
fetchFromQ();
|
fetchFromQ();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function fetchErrored(err) {
|
||||||
|
console.warn("Fetch Failed: " + err);
|
||||||
|
fetchDone();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches the supplied URL and increments the {@link runningFetches} variable
|
* Fetches the supplied URL and increments the {@link runningFetches} variable
|
||||||
* to represent an inflight request.
|
* to represent an inflight request.
|
||||||
@ -130,8 +135,8 @@ function fetchURL(toBeFetched) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fetch(url, options)
|
fetch(url, options)
|
||||||
.then(fetchDoneOrErrored)
|
.then(fetchDone)
|
||||||
.catch(fetchDoneOrErrored);
|
.catch(fetchErrored);
|
||||||
}
|
}
|
||||||
|
|
||||||
function queueOrFetch(toBeFetched) {
|
function queueOrFetch(toBeFetched) {
|
||||||
|
File diff suppressed because one or more lines are too long
@ -20,7 +20,7 @@ html, body
|
|||||||
<body style="margin: 0px; padding: 0px;">
|
<body style="margin: 0px; padding: 0px;">
|
||||||
|
|
||||||
<div id="wb_iframe_div">
|
<div id="wb_iframe_div">
|
||||||
<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe"></iframe>
|
<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe" allow="autoplay; fullscreen"></iframe>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
|
var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
|
||||||
|
@ -6,25 +6,22 @@ from warcio.limitreader import LimitReader
|
|||||||
from warcio.utils import BUFF_SIZE
|
from warcio.utils import BUFF_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
def no_except_close(closable):
|
def no_except_close(closable):
|
||||||
"""Attempts to call the close method of the
|
"""Attempts to call the close method of the
|
||||||
supplied object.
|
supplied object catching all exceptions.
|
||||||
|
Also tries to call release_conn() in case a requests raw stream
|
||||||
|
|
||||||
:param closable: The object to be closed
|
:param closable: The object to be closed
|
||||||
:rtype: None
|
:rtype: None
|
||||||
"""
|
"""
|
||||||
if not closable:
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
closable.close()
|
closable.close()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
release_conn = getattr(closable, 'release_conn', None)
|
closable.release_conn()
|
||||||
if release_conn is not None:
|
|
||||||
release_conn()
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -121,3 +118,18 @@ class OffsetLimitReader(LimitReader):
|
|||||||
def readline(self, length=None):
|
def readline(self, length=None):
|
||||||
self._skip()
|
self._skip()
|
||||||
return super(OffsetLimitReader, self).readline(length)
|
return super(OffsetLimitReader, self).readline(length)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class StreamClosingReader(object):
|
||||||
|
def __init__(self, stream):
|
||||||
|
self.stream = stream
|
||||||
|
|
||||||
|
def read(self, length=None):
|
||||||
|
return self.stream.read(length)
|
||||||
|
|
||||||
|
def readline(self, length=None):
|
||||||
|
return self.stream.readline(length)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
no_except_close(self.stream)
|
||||||
|
@ -21,7 +21,7 @@ import re
|
|||||||
|
|
||||||
from io import open, BytesIO
|
from io import open, BytesIO
|
||||||
from warcio.limitreader import LimitReader
|
from warcio.limitreader import LimitReader
|
||||||
from pywb.utils.io import no_except_close
|
from pywb.utils.io import no_except_close, StreamClosingReader
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import boto3
|
import boto3
|
||||||
@ -355,7 +355,7 @@ class HttpLoader(BaseLoader):
|
|||||||
|
|
||||||
r = self.session.get(url, headers=headers, stream=True)
|
r = self.session.get(url, headers=headers, stream=True)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.raw
|
return StreamClosingReader(r.raw)
|
||||||
|
|
||||||
|
|
||||||
# =================================================================
|
# =================================================================
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
__version__ = '2.4.0-rc5'
|
__version__ = '2.4.0-rc6'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__version__)
|
print(__version__)
|
||||||
|
@ -181,10 +181,13 @@ class CDXObject(OrderedDict):
|
|||||||
:param fields: list of field names to output.
|
:param fields: list of field names to output.
|
||||||
"""
|
"""
|
||||||
if fields is None:
|
if fields is None:
|
||||||
return str(self) + '\n'
|
if self.cdxline:
|
||||||
|
return to_native_str(self.cdxline, 'utf-8') + '\n'
|
||||||
|
|
||||||
|
fields = six.iterkeys(self)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = ' '.join(str(self[x]) for x in fields) + '\n'
|
result = ' '.join(str(self.get(x, '-')) for x in fields) + '\n'
|
||||||
except KeyError as ke:
|
except KeyError as ke:
|
||||||
msg = 'Invalid field "{0}" found in fields= argument'
|
msg = 'Invalid field "{0}" found in fields= argument'
|
||||||
msg = msg.format(str(ke))
|
msg = msg.format(str(ke))
|
||||||
|
@ -20,6 +20,7 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.or
|
|||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
||||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ class PrefixResolver(object):
|
|||||||
if hasattr(cdx, '_formatter') and cdx._formatter:
|
if hasattr(cdx, '_formatter') and cdx._formatter:
|
||||||
full_path = cdx._formatter.format(full_path)
|
full_path = cdx._formatter.format(full_path)
|
||||||
|
|
||||||
path = full_path + filename
|
path = os.path.join(full_path, filename)
|
||||||
if '*' not in path:
|
if '*' not in path:
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ six
|
|||||||
warcio>=1.7.1
|
warcio>=1.7.1
|
||||||
requests
|
requests
|
||||||
redis<3.0
|
redis<3.0
|
||||||
jinja2
|
jinja2<3.0.0
|
||||||
surt>=0.3.1
|
surt>=0.3.1
|
||||||
brotlipy
|
brotlipy
|
||||||
pyyaml
|
pyyaml
|
||||||
|
@ -46,6 +46,19 @@ class TestCDXApp(BaseTestClass):
|
|||||||
assert len(lines) == 3, resp.text
|
assert len(lines) == 3, resp.text
|
||||||
assert len(list(map(json.loads, lines))) == 3
|
assert len(list(map(json.loads, lines))) == 3
|
||||||
|
|
||||||
|
def test_exact_url_plain_text(self):
|
||||||
|
"""
|
||||||
|
basic exact match, no filters, etc.
|
||||||
|
"""
|
||||||
|
resp = self.query('http://www.iana.org/', output='text')
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.content_type == 'text/plain'
|
||||||
|
assert '{' not in resp.text
|
||||||
|
|
||||||
|
lines = resp.text.splitlines()
|
||||||
|
assert len(lines) == 3, resp.text
|
||||||
|
|
||||||
def test_prefix_match(self):
|
def test_prefix_match(self):
|
||||||
"""
|
"""
|
||||||
prefix match test
|
prefix match test
|
||||||
|
@ -430,12 +430,16 @@ class TestProxyIncludeAutoFetchWorkerNotWombat(BaseTestProxy):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
|
class TestProxyAutoFetchWorkerEndPoints(CollsDirMixin, BaseTestProxy):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestProxyAutoFetchWorkerEndPoints, cls).setup_class(
|
super(TestProxyAutoFetchWorkerEndPoints, cls).setup_class(
|
||||||
proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True}
|
coll='test2',
|
||||||
|
config_file='config_test_record.yaml',
|
||||||
|
proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True},
|
||||||
|
recording=True
|
||||||
)
|
)
|
||||||
|
manager(['init', 'test2'])
|
||||||
|
|
||||||
def test_proxy_fetch_options_request(self, scheme):
|
def test_proxy_fetch_options_request(self, scheme):
|
||||||
expected_origin = '{0}://example.com'.format(scheme)
|
expected_origin = '{0}://example.com'.format(scheme)
|
||||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
|||||||
Subproject commit b8a75357e82ef91b006be177cc3e5d827e02ff7d
|
Subproject commit 1dc98bc1f3b90054536d767102b64d71e3da3ad1
|
Loading…
x
Reference in New Issue
Block a user