RC6 - Various Fixes (#540)

* fixes for RC6:
  - blockrecordloader: ensure record stream is closed after parsing one record
  - wrap HttpLoader streams in StreamClosingReader() which should close the connection even if stream not fully consumed
  - simplify no_except_close
  may help with ukwa/ukwa-pywb#53
  - iframe: add allow fullscreen, autoplay
  - wombat: update to latest, filter out custom wombat props from getOwnPropertyNames
  - rules: add rule for vimeo

* cdx formatting: fix output=text to return plain text / non-cdxj output

* auto fetch fix:
  - update to latest wombat to fix auto-fetch in rewriting mode
  - fix /proxy-fetch/ endpoint for proxy mode recording, switch proxy-fetch to run in recording mode
  - don't use global to allow repeated checks

* rewriter html check: peek 1024 bytes to determine if page is html instead of 128

* fix jinja2 dependency for py2
2025-03-15 00:03:28 +01:00 · 2020-02-20 21:53:00 -08:00 · 2020-02-20 21:53:00 -08:00 · 92e459bda5
commit 92e459bda5
parent fa021eebab
18 changed files with 84 additions and 38 deletions
--- a/pywb/apps/frontendapp.py
+++ b/pywb/apps/frontendapp.py
@ -82,6 +82,7 @@ class FrontEndApp(object):
        self.proxy_prefix = None  # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
        self.proxy_coll = None  # the name of the collection that has proxy mode enabled
        self.proxy_record = False # indicate if proxy recording
        self.init_proxy(config)
        self.init_recorder(config.get('recorder'))
@ -627,17 +628,21 @@ class FrontEndApp(object):
            if proxy_coll in self.warcserver.list_fixed_routes():
                raise Exception('Can not record into fixed collection')
-            proxy_coll += self.RECORD_ROUTE
+            proxy_route = proxy_coll + self.RECORD_ROUTE
            if not config.get('recorder'):
                config['recorder'] = 'live'
            self.proxy_record = True
        else:
            logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
            self.proxy_record = False
            proxy_route = proxy_coll
        if proxy_config.get('enable_content_rewrite', True):
-            self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
+            self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
        else:
-            self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
+            self.proxy_prefix = '/{0}/id_/'.format(proxy_route)
        self.proxy_default_timestamp = proxy_config.get('default_timestamp')
        if self.proxy_default_timestamp:
@ -686,14 +691,14 @@ class FrontEndApp(object):
            return WbResponse.options_response(env)
        # ensure full URL
-        request_url = env['REQUEST_URI']
+        url = env['REQUEST_URI'].split('/proxy-fetch/', 1)[-1]
-        # replace with /id_ so we do not get rewritten
+
-        url = request_url.replace('/proxy-fetch', '/id_')
+        env['REQUEST_URI'] = self.proxy_prefix + url
-        # update WSGI environment object
+        env['PATH_INFO'] = self.proxy_prefix + env['PATH_INFO'].split('/proxy-fetch/', 1)[-1]
-        env['REQUEST_URI'] = self.proxy_coll + url
+
        env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_')
        # make request using normal serve_content
-        response = self.serve_content(env, self.proxy_coll, url)
+        response = self.serve_content(env, self.proxy_coll, url, record=self.proxy_record)
        # for WR
        if isinstance(response, WbResponse):
            response.add_access_control_headers(env=env)
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@ -488,7 +488,7 @@ class RewriteInfo(object):
        else:
            return text_type
-        buff = self.read_and_keep(128)
+        buff = self.read_and_keep(1024)
        # check if doesn't start with a tag, then likely not html
        if self.TAG_REGEX.match(buff):
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@ -113,7 +113,7 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ this.__WB_source = obj;
            # rewriting 'this.' special properties access, not on new line (no ;)
            (r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
            # rewrite '= this' or ', this'
-            (r'(?<=[=,])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
+            (r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
            # rewrite ')(this)'
            ('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
            # rewrite this in && or || expr?
--- a/pywb/rewrite/test/test_regex_rewriters.py
+++ b/pywb/rewrite/test/test_regex_rewriters.py
@ -197,6 +197,9 @@ r"""
 >>> _test_js_obj_proxy('return this.foo')
 'return this.foo'
 >>> _test_js_obj_proxy('{foo: bar, this: other}')
 '{foo: bar, this: other}'
 >>> _test_js_obj_proxy(r'this.$location = http://example.com/')
 'this.$location = http://example.com/'
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -344,7 +344,7 @@ rules:
          - videoFileId
          - signature
-    - url_prefix: 'net,akamaized,gcs-vimeo)/'
+    - url_prefix: ['net,akamaized,gcs-vimeo)/', 'net,akamaized,vod)/']
      fuzzy_lookup:
        match: '([/\d]+\.mp4)$'
--- a/pywb/static/autoFetchWorker.js
+++ b/pywb/static/autoFetchWorker.js
@ -23,7 +23,7 @@ var config = {
  rwRe: null,
  defaultFetchOptions: {
    cache: 'force-cache',
-    mode: null
+    mode: 'cors'
  }
 };
@ -53,7 +53,7 @@ if (!config.haveFetch) {
      xhr.onreadystatechange = function() {
        if (xhr.readyState === 4) {
          if (!config.havePromise) {
-            fetchDoneOrErrored();
+            fetchDone();
          }
          resolve();
        }
@ -78,7 +78,7 @@ if (location.search.indexOf('init') !== -1) {
    config.prefix = init.prefix;
    config.mod = init.mod;
    config.prefixMod = init.prefix + init.mod;
-    config.rwRe = new RegExp(init.rwRe, 'g');
+    config.rwRe = new RegExp(init.rwRe);
    config.relative = init.prefix.split(location.origin)[1];
    config.schemeless = '/' + config.relative;
  })();
@ -101,11 +101,16 @@ self.onmessage = function(event) {
 function noop() {}
-function fetchDoneOrErrored() {
+function fetchDone() {
  runningFetches -= 1;
  fetchFromQ();
 }
 function fetchErrored(err) {
  console.warn("Fetch Failed: " + err);
  fetchDone();
 }
 /**
 * Fetches the supplied URL and increments the {@link runningFetches} variable
 * to represent an inflight request.
@ -130,8 +135,8 @@ function fetchURL(toBeFetched) {
  }
  fetch(url, options)
-    .then(fetchDoneOrErrored)
+    .then(fetchDone)
-    .catch(fetchDoneOrErrored);
+    .catch(fetchErrored);
 }
 function queueOrFetch(toBeFetched) {
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
--- a/pywb/templates/frame_insert.html
+++ b/pywb/templates/frame_insert.html
@ -20,7 +20,7 @@ html, body
 <body style="margin: 0px; padding: 0px;">
 <div id="wb_iframe_div">
-<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe"></iframe>
+<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe" allow="autoplay; fullscreen"></iframe>
 </div>
 <script>
  var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
--- a/pywb/utils/io.py
+++ b/pywb/utils/io.py
@ -6,25 +6,22 @@ from warcio.limitreader import LimitReader
 from warcio.utils import BUFF_SIZE
 # =============================================================================
 def no_except_close(closable):
    """Attempts to call the close method of the
-    supplied object.
+    supplied object catching all exceptions.
    Also tries to call release_conn() in case a requests raw stream
    :param closable: The object to be closed
    :rtype: None
    """
    if not closable:
        return
    try:
        closable.close()
    except Exception:
        pass
    try:
-        release_conn = getattr(closable, 'release_conn', None)
+        closable.release_conn()
        if release_conn is not None:
            release_conn()
    except Exception:
        pass
@ -121,3 +118,18 @@ class OffsetLimitReader(LimitReader):
    def readline(self, length=None):
        self._skip()
        return super(OffsetLimitReader, self).readline(length)
 # ============================================================================
 class StreamClosingReader(object):
    def __init__(self, stream):
        self.stream = stream
    def read(self, length=None):
        return self.stream.read(length)
    def readline(self, length=None):
        return self.stream.readline(length)
    def close(self):
        no_except_close(self.stream)
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@ -21,7 +21,7 @@ import re
 from io import open, BytesIO
 from warcio.limitreader import LimitReader
-from pywb.utils.io import no_except_close
+from pywb.utils.io import no_except_close, StreamClosingReader
 try:
    import boto3
@ -355,7 +355,7 @@ class HttpLoader(BaseLoader):
        r = self.session.get(url, headers=headers, stream=True)
        r.raise_for_status()
-        return r.raw
+        return StreamClosingReader(r.raw)
 # =================================================================
--- a/pywb/version.py
+++ b/pywb/version.py
@ -1,4 +1,4 @@
-__version__ = '2.4.0-rc5'
+__version__ = '2.4.0-rc6'
 if __name__ == '__main__':
    print(__version__)
--- a/pywb/warcserver/index/cdxobject.py
+++ b/pywb/warcserver/index/cdxobject.py
@ -181,10 +181,13 @@ class CDXObject(OrderedDict):
        :param fields: list of field names to output.
        """
        if fields is None:
-            return str(self) + '\n'
+            if self.cdxline:
                return to_native_str(self.cdxline, 'utf-8') + '\n'
            fields = six.iterkeys(self)
        try:
-            result = ' '.join(str(self[x]) for x in fields) + '\n'
+            result = ' '.join(str(self.get(x, '-')) for x in fields) + '\n'
        except KeyError as ke:
            msg = 'Invalid field "{0}" found in fields= argument'
            msg = msg.format(str(ke))
--- a/pywb/warcserver/index/test/test_cdxops.py
+++ b/pywb/warcserver/index/test/test_cdxops.py
@ -20,6 +20,7 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.or
 org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
 org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
 >>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
 org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
--- a/pywb/warcserver/resource/pathresolvers.py
+++ b/pywb/warcserver/resource/pathresolvers.py
@ -37,7 +37,7 @@ class PrefixResolver(object):
        if hasattr(cdx, '_formatter') and cdx._formatter:
            full_path = cdx._formatter.format(full_path)
-        path = full_path + filename
+        path = os.path.join(full_path, filename)
        if '*' not in path:
            return path
--- a/requirements.txt
+++ b/requirements.txt
@ -2,7 +2,7 @@ six
 warcio>=1.7.1
 requests
 redis<3.0
-jinja2
+jinja2<3.0.0
 surt>=0.3.1
 brotlipy
 pyyaml
--- a/tests/test_cdx_server_app.py
+++ b/tests/test_cdx_server_app.py
@ -46,6 +46,19 @@ class TestCDXApp(BaseTestClass):
        assert len(lines) == 3, resp.text
        assert len(list(map(json.loads, lines))) == 3
    def test_exact_url_plain_text(self):
        """
        basic exact match, no filters, etc.
        """
        resp = self.query('http://www.iana.org/', output='text')
        assert resp.status_code == 200
        assert resp.content_type == 'text/plain'
        assert '{' not in resp.text
        lines = resp.text.splitlines()
        assert len(lines) == 3, resp.text
    def test_prefix_match(self):
        """
        prefix match test
--- a/tests/test_proxy.py
+++ b/tests/test_proxy.py
@ -430,12 +430,16 @@ class TestProxyIncludeAutoFetchWorkerNotWombat(BaseTestProxy):
 # ============================================================================
-class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
+class TestProxyAutoFetchWorkerEndPoints(CollsDirMixin, BaseTestProxy):
    @classmethod
    def setup_class(cls):
        super(TestProxyAutoFetchWorkerEndPoints, cls).setup_class(
-            proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True}
+            coll='test2',
            config_file='config_test_record.yaml',
            proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True},
            recording=True
        )
        manager(['init', 'test2'])
    def test_proxy_fetch_options_request(self, scheme):
        expected_origin = '{0}://example.com'.format(scheme)
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit b8a75357e82ef91b006be177cc3e5d827e02ff7d
+Subproject commit 1dc98bc1f3b90054536d767102b64d71e3da3ad1
		`@ -1 +1 @@`
			`Subproject commit b8a75357e82ef91b006be177cc3e5d827e02ff7d`				`Subproject commit 1dc98bc1f3b90054536d767102b64d71e3da3ad1`