mirror of https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00

New Feature: support for autoFetch of urls deemed as pages by history api (pywb part) (#497)

* auto-fetch page fetch support:
  - check for X-Wombat-History-Page header to indicate page url
  - set title from X-Wombat-History-Title header, and attempt to parse <title> from response
  - update auto-fetch workers in wombat
  - update changelist, bump to 2.3.4

This commit is contained in:
parent bf9284fec5
commit e79c657255
@@ -1,3 +1,12 @@
+pywb 2.3.4 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* Improvements to auto-fetch to support page fetch (webrecorder/wombat#5, #497)
+
+  - Support fetching page with ``X-Wombat-History-Page`` and title ``X-Wombat-History-Title`` headers present.
+  - Attempt to extract title and pass along with cdx to ``_add_history_page()`` callback in RewriterApp, to indicate a url is a page.
+  - General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
+
 pywb 2.3.3 changelist
 ~~~~~~~~~~~~~~~~~~~~~
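The changelist entry above describes the new page-announce flow end to end. A minimal sketch of what a client request might look like, assuming a pywb instance at http://localhost:8080 with a collection named my-coll (host, port, collection, and the mp_ modifier are all placeholders); the header names come from the changelist, and the JSON response shape comes from the RewriterApp hunk further down:

    import requests

    # Placeholder replay URL; 'my-coll' and the 'mp_' modifier are assumptions.
    replay_url = 'http://localhost:8080/my-coll/mp_/https://example.com/page'

    # Mark this request as a page; the title header is percent-encoded,
    # matching the unquote() call added in the rewriterapp diff below.
    resp = requests.get(replay_url, headers={
        'X-Wombat-History-Page': 'https://example.com/page',
        'X-Wombat-History-Title': 'Example%20Page',
    })

    print(resp.json())  # expected shape: {'title': 'Example Page'}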
@@ -3,7 +3,7 @@ from io import BytesIO
 import requests
 from fakeredis import FakeStrictRedis
 
-from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
+from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote
 from warcio.bufferedreaders import BufferedReader
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
@@ -222,6 +222,13 @@ class RewriterApp(object):
         if proto:
             environ['wsgi.url_scheme'] = proto
 
+        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
+        if history_page:
+            wb_url.url = history_page
+            is_ajax = True
+        else:
+            is_ajax = self.is_ajax(environ)
+
         is_timegate = self._check_accept_dt(wb_url, environ)
 
         host_prefix = self.get_host_prefix(environ)
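Side note on the hunk above (not part of the diff): WSGI exposes a request header such as X-Wombat-History-Page in the environ dict as HTTP_X_WOMBAT_HISTORY_PAGE, and pop() both reads the value and keeps the header from being forwarded further. A tiny illustration:

    # Per the WSGI spec (PEP 3333): uppercase, '-' -> '_', 'HTTP_' prefix.
    environ = {'HTTP_X_WOMBAT_HISTORY_PAGE': 'https://example.com/page'}

    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    assert history_page == 'https://example.com/page'
    assert 'HTTP_X_WOMBAT_HISTORY_PAGE' not in environ  # header consumed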
@@ -358,8 +365,6 @@ class RewriterApp(object):
             if self._add_range(record, wb_url, range_start, range_end):
                 wb_url.mod = 'id_'
 
-        is_ajax = self.is_ajax(environ)
-
         if is_ajax:
             head_insert_func = None
             urlrewriter.rewrite_opts['is_ajax'] = True
@@ -392,6 +397,17 @@ class RewriterApp(object):
 
         status_headers, gen, is_rw = result
 
+        if history_page:
+            title = DefaultRewriter._extract_title(gen)
+            if not title:
+                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
+
+            if not title:
+                title = history_page
+
+            self._add_history_page(cdx, kwargs, title)
+            return WbResponse.json_response({'title': title})
+
         if setcookie_headers:
             status_headers.headers.extend(setcookie_headers)
 
@@ -660,6 +676,9 @@ class RewriterApp(object):
         else:
             return None
 
+    def _add_history_page(self, cdx, kwargs, doc_title):
+        pass
+
     def _add_custom_params(self, cdx, headers, kwargs, record):
         pass
 
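_add_history_page() is deliberately a no-op hook in the base class. A hypothetical sketch of how an embedding application might override it to collect pages; the RecordingRewriterApp name and the dict it stores are invented for illustration:

    class RecordingRewriterApp(RewriterApp):
        def __init__(self, *args, **kwargs):
            super(RecordingRewriterApp, self).__init__(*args, **kwargs)
            self.pages = []

        def _add_history_page(self, cdx, kwargs, doc_title):
            # doc_title was resolved from the <title> tag, then the
            # X-Wombat-History-Title header, then the page url itself.
            self.pages.append({'url': cdx.get('url'), 'title': doc_title})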
@@ -18,6 +18,21 @@ WORKER_MODS = {"wkr_", "sw_"}  # type: Set[str]
 class BaseContentRewriter(object):
     CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
 
+    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
+
+    @classmethod
+    def _extract_title(cls, gen):
+        title_res = list(gen)
+        if not title_res or not title_res[0]:
+            return
+
+        m = cls.TITLE.search(title_res[0].decode('utf-8'))
+        if not m:
+            return
+
+        title_res = m.group(1)
+        return title_res.strip()
+
     def __init__(self, rules_file, replay_mod=''):
         self.rules = []
         self.all_rewriters = []
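A quick illustration of the new _extract_title() helper above. Note that it consumes the generator with list() and only searches the first chunk; the import path is an assumption based on where BaseContentRewriter is defined:

    from pywb.rewrite.content_rewriter import BaseContentRewriter  # assumed path

    # gen is an iterable of bytes chunks from the rewriter pipeline; only the
    # first chunk is searched for a <title> element (case-insensitive).
    html = b'<html><head><TITLE> Example Page </TITLE></head></html>'
    title = BaseContentRewriter._extract_title(iter([html]))
    assert title == 'Example Page'  # surrounding whitespace is stripped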
@@ -101,27 +101,36 @@ function fetchDoneOrErrored() {
   fetchFromQ();
 }
 
-function fetchURL(urlToBeFetched) {
+function fetchURL(toBeFetched) {
   runningFetches += 1;
-  fetch(urlToBeFetched)
+
+  var url;
+  var options;
+
+  if (typeof toBeFetched === 'object') {
+    url = toBeFetched.url;
+    options = toBeFetched.options;
+  } else {
+    url = toBeFetched;
+  }
+
+  fetch(url, options)
     .then(fetchDoneOrErrored)
     .catch(fetchDoneOrErrored);
 }
 
-function queueOrFetch(urlToBeFetched) {
-  if (
-    !urlToBeFetched ||
-    urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
-    seen[urlToBeFetched] != null
-  ) {
+function queueOrFetch(toBeFetched) {
+  var url = typeof toBeFetched === 'object' ? toBeFetched.url : toBeFetched;
+
+  if (!url || url.indexOf(DataURLPrefix) === 0 || seen[url] != null) {
     return;
   }
-  seen[urlToBeFetched] = true;
+  seen[url] = true;
   if (runningFetches >= MaxRunningFetches) {
-    queue.push(urlToBeFetched);
+    queue.push(toBeFetched);
     return;
   }
-  fetchURL(urlToBeFetched);
+  fetchURL(toBeFetched);
 }
 
 function fetchFromQ() {
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
-__version__ = '2.3.3'
+__version__ = '2.3.4'
 
 if __name__ == '__main__':
     print(__version__)
wombat
@@ -1 +1 @@
-Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333
+Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3