mirror of https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
New Feature: support for autoFetch of URLs deemed to be pages by the History API (pywb part) (#497)
* auto-fetch page fetch support:
  - check for the X-Wombat-History-Page header to indicate a page URL
  - set the title from the X-Wombat-History-Title header, and attempt to parse <title> from the response
  - update the auto-fetch workers in wombat
  - update the changelist, bump to 2.3.4
This commit is contained in:
parent bf9284fec5
commit e79c657255
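
In practice the round trip looks like this: the auto-fetch worker requests a replay URL with the two headers set, and RewriterApp short-circuits the normal response, answering with the resolved title as JSON. A minimal sketch against a running pywb instance (the port, collection name, and page URL are hypothetical; the header names and the JSON response shape come from this commit):

    import requests

    # Hypothetical replay endpoint: collection "example", latest capture, mp_ modifier.
    replay_url = 'http://localhost:8080/example/mp_/https://example.com/page'

    resp = requests.get(replay_url, headers={
        # Marks the requested URL as a page discovered via the History API.
        'X-Wombat-History-Page': 'https://example.com/page',
        # Percent-encoded fallback title; the server unquotes it and uses it
        # only if no <title> can be parsed from the response body.
        'X-Wombat-History-Title': 'Example%20Page',
    })

    print(resp.json())  # e.g. {'title': 'Example Page'}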
@@ -1,3 +1,12 @@
+pywb 2.3.4 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* Improvements to auto-fetch to support page fetch (webrecorder/wombat#5, #497)
+
+  - Support fetching a page when the ``X-Wombat-History-Page`` and title ``X-Wombat-History-Title`` headers are present.
+  - Attempt to extract the title and pass it, along with the cdx, to the ``_add_history_page()`` callback in RewriterApp to indicate that a URL is a page.
+  - General auto-fetch fixes: queue messages if the worker is not yet initialized (in proxy mode); only parse <link> stylesheet hrefs as stylesheets.
+
 pywb 2.3.3 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -3,7 +3,7 @@ from io import BytesIO
 import requests
 from fakeredis import FakeStrictRedis
 
-from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
+from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote
 from warcio.bufferedreaders import BufferedReader
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
@@ -222,6 +222,13 @@ class RewriterApp(object):
         if proto:
             environ['wsgi.url_scheme'] = proto
 
+        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
+        if history_page:
+            wb_url.url = history_page
+            is_ajax = True
+        else:
+            is_ajax = self.is_ajax(environ)
+
         is_timegate = self._check_accept_dt(wb_url, environ)
 
         host_prefix = self.get_host_prefix(environ)
@@ -358,8 +365,6 @@ class RewriterApp(object):
         if self._add_range(record, wb_url, range_start, range_end):
             wb_url.mod = 'id_'
 
-        is_ajax = self.is_ajax(environ)
-
         if is_ajax:
             head_insert_func = None
             urlrewriter.rewrite_opts['is_ajax'] = True
@@ -392,6 +397,17 @@ class RewriterApp(object):
 
         status_headers, gen, is_rw = result
 
+        if history_page:
+            title = DefaultRewriter._extract_title(gen)
+            if not title:
+                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
+
+            if not title:
+                title = history_page
+
+            self._add_history_page(cdx, kwargs, title)
+            return WbResponse.json_response({'title': title})
+
         if setcookie_headers:
             status_headers.headers.extend(setcookie_headers)
 
@@ -660,6 +676,9 @@ class RewriterApp(object):
         else:
             return None
 
+    def _add_history_page(self, cdx, kwargs, doc_title):
+        pass
+
     def _add_custom_params(self, cdx, headers, kwargs, record):
         pass
 
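
``_add_history_page()`` is deliberately a no-op in RewriterApp itself; an embedding application overrides it to record the URL as a page. A minimal sketch of such an override (the subclass and its in-memory store are hypothetical):

    from pywb.apps.rewriterapp import RewriterApp

    class PageRecordingRewriterApp(RewriterApp):
        """Hypothetical subclass that records URLs reported as pages."""

        def __init__(self, *args, **kwargs):
            super(PageRecordingRewriterApp, self).__init__(*args, **kwargs)
            self.pages = []  # hypothetical in-memory page store

        def _add_history_page(self, cdx, kwargs, doc_title):
            # cdx is the dict-like CDX record of the capture; doc_title is the
            # parsed <title>, the unquoted X-Wombat-History-Title value, or the
            # page URL itself, in that order of preference.
            self.pages.append({'url': cdx.get('url'),
                               'timestamp': cdx.get('timestamp'),
                               'title': doc_title})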
@@ -18,6 +18,21 @@ WORKER_MODS = {"wkr_", "sw_"}  # type: Set[str]
 class BaseContentRewriter(object):
     CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')
 
+    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
+
+    @classmethod
+    def _extract_title(cls, gen):
+        title_res = list(gen)
+        if not title_res or not title_res[0]:
+            return
+
+        m = cls.TITLE.search(title_res[0].decode('utf-8'))
+        if not m:
+            return
+
+        title_res = m.group(1)
+        return title_res.strip()
+
     def __init__(self, rules_file, replay_mod=''):
         self.rules = []
         self.all_rewriters = []
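
Note that ``_extract_title()`` materializes the generator but searches only its first chunk, so a <title> that appears later in a large response will be missed. A quick usage sketch (calling through ``DefaultRewriter``, as the RewriterApp hunk above does; the exact import path is an assumption based on pywb's layout):

    from pywb.rewrite.default_rewriter import DefaultRewriter

    # Only the first yielded chunk is searched for a <title> element.
    body = iter([b'<html><head><title> My Archived Page </title></head>'])
    print(DefaultRewriter._extract_title(body))  # 'My Archived Page'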
@@ -101,27 +101,36 @@ function fetchDoneOrErrored() {
   fetchFromQ();
 }
 
-function fetchURL(urlToBeFetched) {
+function fetchURL(toBeFetched) {
   runningFetches += 1;
-  fetch(urlToBeFetched)
+
+  var url;
+  var options;
+
+  if (typeof toBeFetched === 'object') {
+    url = toBeFetched.url;
+    options = toBeFetched.options;
+  } else {
+    url = toBeFetched;
+  }
+
+  fetch(url, options)
     .then(fetchDoneOrErrored)
     .catch(fetchDoneOrErrored);
 }
 
-function queueOrFetch(urlToBeFetched) {
-  if (
-    !urlToBeFetched ||
-    urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
-    seen[urlToBeFetched] != null
-  ) {
+function queueOrFetch(toBeFetched) {
+  var url = typeof toBeFetched === 'object' ? toBeFetched.url : toBeFetched;
+
+  if (!url || url.indexOf(DataURLPrefix) === 0 || seen[url] != null) {
     return;
   }
-  seen[urlToBeFetched] = true;
+  seen[url] = true;
   if (runningFetches >= MaxRunningFetches) {
-    queue.push(urlToBeFetched);
+    queue.push(toBeFetched);
     return;
   }
-  fetchURL(urlToBeFetched);
+  fetchURL(toBeFetched);
 }
 
 function fetchFromQ() {
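
The object form accepted here lets callers queue a fetch as {url: ..., options: ...} rather than a bare string, so individual fetches can carry custom request options. Presumably this is how the wombat side attaches the X-Wombat-History-Page and X-Wombat-History-Title headers to page fetches, but the payload wombat actually sends lives in the wombat repo (webrecorder/wombat#5).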
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
-__version__ = '2.3.3'
+__version__ = '2.3.4'
 
 if __name__ == '__main__':
     print(__version__)
wombat
@@ -1 +1 @@
-Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333
+Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3