
New Feature: support for auto-fetch of URLs deemed to be pages by the History API (pywb part) (#497)

* auto-fetch page fetch support:
- check for X-Wombat-History-Page header to indicate page url
- set title from X-Wombat-History-Title header, and attempt to parse <title> from response
- update auto-fetch workers in wombat
- update changelist, bump to 2.3.4
Ilya Kreymer 2019-08-12 13:34:33 -07:00 committed by GitHub
parent bf9284fec5
commit e79c657255
8 changed files with 70 additions and 18 deletions


@@ -1,3 +1,12 @@
+pywb 2.3.4 changelist
+~~~~~~~~~~~~~~~~~~~~~
+* Improvements to auto-fetch to support page fetch (webrecorder/wombat#5, #497)
+
+  - Support fetching page with ``X-Wombat-History-Page`` and title ``X-Wombat-History-Title`` headers present.
+  - Attempt to extract title and pass along with cdx to ``_add_history_page()`` callback in RewriterApp, to indicate a url is a page.
+  - General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
+
 pywb 2.3.3 changelist
 ~~~~~~~~~~~~~~~~~~~~~
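For illustration, a minimal client-side sketch of the new exchange (the host, collection name, and timestamp below are placeholders, not part of pywb): the auto-fetch worker requests the replayed page with ``X-Wombat-History-Page`` set to the page URL (exposed to RewriterApp by WSGI as ``HTTP_X_WOMBAT_HISTORY_PAGE``) and an optional URL-encoded ``X-Wombat-History-Title``, and receives a small JSON body carrying the resolved title instead of the usual rewritten page.

    # Hypothetical illustration of the header exchange; the host, collection
    # name ('my-coll') and timestamp are placeholders, not part of pywb.
    import requests
    from six.moves.urllib.parse import quote

    page_url = 'https://example.com/some/page'

    resp = requests.get(
        'http://localhost:8080/my-coll/20190812000000mp_/' + page_url,
        headers={
            # marks this request as a page discovered via the History API
            'X-Wombat-History-Page': page_url,
            # optional fallback title, URL-encoded (unquoted server-side)
            'X-Wombat-History-Title': quote('Some Page Title'),
        })

    # RewriterApp short-circuits normal replay and returns JSON instead
    print(resp.json())  # e.g. {'title': 'Some Page Title'}

If the replayed HTML contains a usable <title>, that value wins; otherwise the header value is used, and finally the page URL itself.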


@@ -3,7 +3,7 @@ from io import BytesIO
 import requests
 from fakeredis import FakeStrictRedis
-from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
+from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote
 from warcio.bufferedreaders import BufferedReader
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
@@ -222,6 +222,13 @@ class RewriterApp(object):
         if proto:
             environ['wsgi.url_scheme'] = proto

+        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
+        if history_page:
+            wb_url.url = history_page
+            is_ajax = True
+        else:
+            is_ajax = self.is_ajax(environ)
+
         is_timegate = self._check_accept_dt(wb_url, environ)

         host_prefix = self.get_host_prefix(environ)
@@ -358,8 +365,6 @@ class RewriterApp(object):
         if self._add_range(record, wb_url, range_start, range_end):
             wb_url.mod = 'id_'

-        is_ajax = self.is_ajax(environ)
-
         if is_ajax:
             head_insert_func = None
             urlrewriter.rewrite_opts['is_ajax'] = True
@@ -392,6 +397,17 @@ class RewriterApp(object):
         status_headers, gen, is_rw = result

+        if history_page:
+            title = DefaultRewriter._extract_title(gen)
+            if not title:
+                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
+            if not title:
+                title = history_page
+            self._add_history_page(cdx, kwargs, title)
+            return WbResponse.json_response({'title': title})
+
         if setcookie_headers:
             status_headers.headers.extend(setcookie_headers)
@@ -660,6 +676,9 @@ class RewriterApp(object):
         else:
             return None

+    def _add_history_page(self, cdx, kwargs, doc_title):
+        pass
+
     def _add_custom_params(self, cdx, headers, kwargs, record):
         pass
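``_add_history_page()`` is deliberately a no-op hook; embedders override it to record the detected page. A minimal sketch of such a subclass (the class name and in-memory storage are illustrative only):

    # Illustrative subclass: collects pages reported via the History API.
    class PageRecordingRewriterApp(RewriterApp):
        def __init__(self, *args, **kwargs):
            super(PageRecordingRewriterApp, self).__init__(*args, **kwargs)
            self.detected_pages = []

        def _add_history_page(self, cdx, kwargs, doc_title):
            # cdx is the dict-like capture record; doc_title is the title
            # extracted from the response or supplied via the header.
            self.detected_pages.append({
                'url': cdx.get('url'),
                'timestamp': cdx.get('timestamp'),
                'title': doc_title,
            })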


@@ -18,6 +18,21 @@ WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
 class BaseContentRewriter(object):
     CHARSET_REGEX = re.compile(b'<meta[^>]*?[\s;"\']charset\s*=[\s"\']*([^\s"\'/>]*)')

+    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)
+
+    @classmethod
+    def _extract_title(cls, gen):
+        title_res = list(gen)
+        if not title_res or not title_res[0]:
+            return
+
+        m = cls.TITLE.search(title_res[0].decode('utf-8'))
+        if not m:
+            return
+
+        title_res = m.group(1)
+        return title_res.strip()
+
     def __init__(self, rules_file, replay_mod=''):
         self.rules = []
         self.all_rewriters = []
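A standalone check of the extraction behaviour (this mirrors the ``TITLE`` pattern and the first-chunk handling of ``_extract_title()``; the sample HTML is made up):

    import re

    # same pattern as the new BaseContentRewriter.TITLE attribute
    TITLE = re.compile(r'<\s*title\s*>(.*)<\s*\/\s*title\s*>', re.M | re.I | re.S)

    # _extract_title() consumes the generator but only searches the first chunk
    first_chunk = b'<html><head><title>\n  Example Page\n</title></head><body>...'

    m = TITLE.search(first_chunk.decode('utf-8'))
    print(m.group(1).strip() if m else None)  # -> Example Page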


@@ -101,27 +101,36 @@ function fetchDoneOrErrored() {
   fetchFromQ();
 }

-function fetchURL(urlToBeFetched) {
+function fetchURL(toBeFetched) {
   runningFetches += 1;
-  fetch(urlToBeFetched)
+  var url;
+  var options;
+  if (typeof toBeFetched === 'object') {
+    url = toBeFetched.url;
+    options = toBeFetched.options;
+  } else {
+    url = toBeFetched;
+  }
+  fetch(url, options)
     .then(fetchDoneOrErrored)
     .catch(fetchDoneOrErrored);
 }

-function queueOrFetch(urlToBeFetched) {
-  if (
-    !urlToBeFetched ||
-    urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
-    seen[urlToBeFetched] != null
-  ) {
+function queueOrFetch(toBeFetched) {
+  var url = typeof toBeFetched === 'object' ? toBeFetched.url : toBeFetched;
+  if (!url || url.indexOf(DataURLPrefix) === 0 || seen[url] != null) {
     return;
   }
-  seen[urlToBeFetched] = true;
+  seen[url] = true;
   if (runningFetches >= MaxRunningFetches) {
-    queue.push(urlToBeFetched);
+    queue.push(toBeFetched);
     return;
   }
-  fetchURL(urlToBeFetched);
+  fetchURL(toBeFetched);
 }

 function fetchFromQ() {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,4 +1,4 @@
-__version__ = '2.3.3'
+__version__ = '2.3.4'

 if __name__ == '__main__':
     print(__version__)

wombat

@@ -1 +1 @@
-Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333
+Subproject commit 5fdacc6cd4c89ee8cd1fcbd3fccd4907069050e3