
New Feature: support for auto-fetch of URLs deemed to be pages by the History API (pywb part) (#497)

* auto-fetch page fetch support:
- check for X-Wombat-History-Page header to indicate page url
- set title from X-Wombat-History-Title header, and attempt to parse <title> from response
- update auto-fetch workers in wombat
- update changelist, bump to 2.3.4
Ilya Kreymer 2019-08-12 13:34:33 -07:00 committed by GitHub
parent bf9284fec5
commit e79c657255
8 changed files with 70 additions and 18 deletions


@@ -1,3 +1,12 @@
+pywb 2.3.4 changelist
+~~~~~~~~~~~~~~~~~~~~~
+* Improvements to auto-fetch to support page fetch (webrecorder/wombat#5, #497)
+
+  - Support fetching page with ``X-Wombat-History-Page`` and title ``X-Wombat-History-Title`` headers present.
+  - Attempt to extract title and pass along with cdx to ``_add_history_page()`` callback in RewriterApp, to indicate a url is a page.
+  - General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
+
 pywb 2.3.3 changelist
 ~~~~~~~~~~~~~~~~~~~~~
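For illustration, a minimal client-side sketch of the new exchange (the host, collection name, and timestamp below are placeholders, not part of pywb): the auto-fetch worker requests the replayed page with ``X-Wombat-History-Page`` set to the page URL (exposed to RewriterApp by WSGI as ``HTTP_X_WOMBAT_HISTORY_PAGE``) and an optional URL-encoded ``X-Wombat-History-Title``, and receives a small JSON body carrying the resolved title instead of the usual rewritten page.

    # Hypothetical illustration of the header exchange; the host, collection
    # name ('my-coll') and timestamp are placeholders, not part of pywb.
    import requests
    from six.moves.urllib.parse import quote

    page_url = 'https://example.com/some/page'

    resp = requests.get(
        'http://localhost:8080/my-coll/20190812000000mp_/' + page_url,
        headers={
            # marks this request as a page discovered via the History API
            'X-Wombat-History-Page': page_url,
            # optional fallback title, URL-encoded (unquoted server-side)
            'X-Wombat-History-Title': quote('Some Page Title'),
        })

    # RewriterApp short-circuits normal replay and returns JSON instead
    print(resp.json())  # e.g. {'title': 'Some Page Title'}

If the replayed HTML contains a usable <title>, that value wins; otherwise the header value is used, and finally the page URL itself.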


@@ -3,7 +3,7 @@ from io import BytesIO
 import requests
 from fakeredis import FakeStrictRedis
-from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
+from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote
 from warcio.bufferedreaders import BufferedReader
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
@@ -222,6 +222,13 @@ class RewriterApp(object):
         if proto:
             environ['wsgi.url_scheme'] = proto

+        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
+        if history_page:
+            wb_url.url = history_page
+            is_ajax = True
+        else:
+            is_ajax = self.is_ajax(environ)
+
         is_timegate = self._check_accept_dt(wb_url, environ)

         host_prefix = self.get_host_prefix(environ)
@@ -358,8 +365,6 @@ class RewriterApp(object):
         if self._add_range(record, wb_url, range_start, range_end):
             wb_url.mod = 'id_'

-        is_ajax = self.is_ajax(environ)
-
         if is_ajax:
             head_insert_func = None
             urlrewriter.rewrite_opts['is_ajax'] = True
@@ -392,6 +397,17 @@ class RewriterApp(object):
         status_headers, gen, is_rw = result

+        if history_page:
+            title = DefaultRewriter._extract_title(gen)
+            if not title:
+                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
+            if not title:
+                title = history_page
+            self._add_history_page(cdx, kwargs, title)
+            return WbResponse.json_response({'title': title})
+
         if setcookie_headers:
             status_headers.headers.extend(setcookie_headers)
@@ -660,6 +676,9 @@ class RewriterApp(object):
         else:
             return None

+    def _add_history_page(self, cdx, kwargs, doc_title):
+        pass
+
     def _add_custom_params(self, cdx, headers, kwargs, record):
         pass
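``_add_history_page()`` is deliberately a no-op hook; embedders override it to record the detected page. A minimal sketch of such a subclass (the class name and in-memory storage are illustrative only):

    # Illustrative subclass: collects pages reported via the History API.
    class PageRecordingRewriterApp(RewriterApp):
        def __init__(self, *args, **kwargs):
            super(PageRecordingRewriterApp, self).__init__(*args, **kwargs)
            self.detected_pages = []

        def _add_history_page(self, cdx, kwargs, doc_title):
            # cdx is the dict-like capture record; doc_title is the title
            # extracted from the response or supplied via the header.
            self.detected_pages.append({
                'url': cdx.get('url'),
                'timestamp': cdx.get('timestamp'),
                'title': doc_title,
            })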


@@ -18,6 +18,21 @@ WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
 class BaseContentRewriter(object):
     CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')

+    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
+
+    @classmethod
+    def _extract_title(cls, gen):
+        title_res = list(gen)
+        if not title_res or not title_res[0]:
+            return
+
+        m = cls.TITLE.search(title_res[0].decode('utf-8'))
+        if not m:
+            return
+
+        title_res = m.group(1)
+        return title_res.strip()
+
     def __init__(self, rules_file, replay_mod=''):
         self.rules = []
         self.all_rewriters = []
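A standalone check of the extraction behaviour (this mirrors the ``TITLE`` pattern and the first-chunk handling of ``_extract_title()``; the sample HTML is made up):

    import re

    # same pattern as the new BaseContentRewriter.TITLE attribute
    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)

    # _extract_title() consumes the generator but only searches the first chunk
    first_chunk = b'<html><head><title>\n  Example Page\n</title></head><body>...'

    m = TITLE.search(first_chunk.decode('utf-8'))
    print(m.group(1).strip() if m else None)  # -> Example Page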


@@ -101,27 +101,36 @@ function fetchDoneOrErrored() {
   fetchFromQ();
 }

-function fetchURL(urlToBeFetched) {
+function fetchURL(toBeFetched) {
   runningFetches += 1;
-  fetch(urlToBeFetched)
+  var url;
+  var options;
+  if (typeof toBeFetched === 'object') {
+    url = toBeFetched.url;
+    options = toBeFetched.options;
+  } else {
+    url = toBeFetched;
+  }
+  fetch(url, options)
     .then(fetchDoneOrErrored)
     .catch(fetchDoneOrErrored);
 }

-function queueOrFetch(urlToBeFetched) {
-  if (
-    !urlToBeFetched ||
-    urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
-    seen[urlToBeFetched] != null
-  ) {
+function queueOrFetch(toBeFetched) {
+  var url = typeof toBeFetched === 'object' ? toBeFetched.url : toBeFetched;
+  if (!url || url.indexOf(DataURLPrefix) === 0 || seen[url] != null) {
     return;
   }
-  seen[urlToBeFetched] = true;
+  seen[url] = true;
   if (runningFetches >= MaxRunningFetches) {
-    queue.push(urlToBeFetched);
+    queue.push(toBeFetched);
     return;
   }
-  fetchURL(urlToBeFetched);
+  fetchURL(toBeFetched);
 }

 function fetchFromQ() {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,4 +1,4 @@
-__version__ = '2.3.3'
+__version__ = '2.3.4'

 if __name__ == '__main__':
     print(__version__)

wombat

@@ -1 +1 @@
-Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333
+Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3