From 9bda61cab561ac71023417e23bbd48526903351a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 Jun 2017 18:41:30 -0700 Subject: [PATCH] mementoindexsource improvements: - use shared session for timegate/timemap queries - catch timegate query exceptions and treat as not found - skip fuzzy match queries (ensure 'is_fuzzy' is set on params) wbmementoindexsource improvements: - fix errors related to exception handling - hook up 'wb-memento' config, add tests jsonp_rewriter: fix typo --- pywb/rewrite/jsonp_rewriter.py | 2 +- pywb/warcserver/index/fuzzymatcher.py | 3 +- pywb/warcserver/index/indexsource.py | 58 +++++++++++-------- pywb/warcserver/test/test_configwarcserver.py | 13 ++++- .../test/test_warcserver_config.yaml | 3 + pywb/warcserver/warcserver.py | 3 +- 6 files changed, 52 insertions(+), 30 deletions(-) diff --git a/pywb/rewrite/jsonp_rewriter.py b/pywb/rewrite/jsonp_rewriter.py index b4d3fac5..fa04332c 100644 --- a/pywb/rewrite/jsonp_rewriter.py +++ b/pywb/rewrite/jsonp_rewriter.py @@ -14,7 +14,7 @@ class JSONPRewriter(StreamingRewriter): return string # see if there is a callback param in current url - m_callback = self.CALLBACK.search(self.urlrewriter.wburl.url) + m_callback = self.CALLBACK.search(self.url_rewriter.wburl.url) if not m_callback: return string diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index b3ccaa96..517bc288 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -107,7 +107,8 @@ class FuzzyMatcher(object): fuzzy_params = {'url': url, 'matchType': matched_rule.match_type, - 'filter': filters} + 'filter': filters, + 'is_fuzzy': True} for key in iterkeys(params): if key not in self.FUZZY_SKIP_PARAMS: diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index e557ed4b..8c40d665 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -316,6 +316,13 @@ class MementoIndexSource(BaseIndexSource): self.timegate_url = timegate_url self.timemap_url = timemap_url self.replay_url = replay_url + self._init_sesh() + + def _init_sesh(self): + self.sesh = requests.Session() + adapt = requests.adapters.HTTPAdapter(max_retries=3) + self.sesh.mount('http://', adapt) + self.sesh.mount('https://', adapt) def links_to_cdxobject(self, link_header, def_name): results = MementoUtils.parse_links(link_header, def_name) @@ -350,8 +357,13 @@ class MementoIndexSource(BaseIndexSource): def get_timegate_links(self, params, timestamp): url = res_template(self.timegate_url, params) accept_dt = timestamp_to_http_date(timestamp) - res = requests.head(url, headers={'Accept-Datetime': accept_dt}) - if res.status_code >= 400: + try: + headers = self._get_headers(params) + headers['Accept-Datetime'] = accept_dt + res = self.sesh.head(url, headers=headers, timeout=None) + if res.status_code >= 400: + raise NotFoundException(url) + except: raise NotFoundException(url) links = res.headers.get('Link') @@ -361,15 +373,15 @@ class MementoIndexSource(BaseIndexSource): return links - def _get_timemap_headers(self, params): + def _get_headers(self, params): return {} def handle_timemap(self, params): url = res_template(self.timemap_url, params) - headers = self._get_timemap_headers(params) - res = requests.get(url, - headers=headers, - timeout=params.get('_timeout')) + headers = self._get_headers(params) + res = self.sesh.get(url, + headers=headers, + timeout=params.get('_timeout')) if res.status_code >= 400 or not res.text: raise NotFoundException(url) @@ -439,11 +451,11 @@ class MementoIndexSource(BaseIndexSource): #============================================================================= class WBMementoIndexSource(MementoIndexSource): - WBURL_MATCH = re.compile('([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)') + WBURL_MATCH = re.compile('([0-9]{0,14})?(?:\w+_)?/{0,3}(.*)') WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}' - def __init__(self, replay_url): - super(WBMementoIndexSource, self).__init__('', '', replay_url) + def __init__(self, timegate_url, timemap_url, replay_url): + super(WBMementoIndexSource, self).__init__(timegate_url, timemap_url, replay_url) self.prefix = replay_url.split('{', 1)[0] def _get_referrer(self, params): @@ -456,14 +468,14 @@ class WBMementoIndexSource(MementoIndexSource): return ref_url - def _get_timemap_headers(self, params): + def _get_headers(self, params): ref_url = self._get_referrer(params) if ref_url: return {'Referer': ref_url} else: return {} - def _extract_location(self, location): + def _extract_location(self, url, location): if not location or not location.startswith(self.prefix): raise NotFoundException(url) @@ -473,30 +485,27 @@ class WBMementoIndexSource(MementoIndexSource): url = m.group(2) timestamp = m.group(1) + location = self._get_replay_url(timestamp, url) return url, timestamp, location def handle_timegate(self, params, timestamp): url = params['url'] load_url = self._get_replay_url(timestamp, url) - ref_url = self._get_referrer(params) try: - headers = {} - if ref_url: - headers = {'Referer': ref_url} - - res = requests.head(load_url, headers=headers) + headers = self._get_headers(params) + res = self.sesh.head(load_url, headers=headers) except Exception as e: raise NotFoundException(url) - if not res.headers.get('Memento-Datetime'): + if res and res.headers.get('Memento-Datetime'): if res.status_code >= 400: raise NotFoundException(url) - if res.status_code == 302: - info = self._extract_location(res.headers.get('Location')) + if res.status_code >= 300: + info = self._extract_location(url, res.headers.get('Location')) else: - info = self._extract_location(res.headers.get('Content-Location')) + info = self._extract_location(url, res.headers.get('Content-Location')) url, timestamp, load_url = info @@ -506,12 +515,11 @@ class WBMementoIndexSource(MementoIndexSource): cdx['url'] = url cdx['load_url'] = load_url - if ref_url: - cdx['set_referrer'] = ref_url + if 'Referer' in headers: + cdx['set_referrer'] = headers['Referer'] return iter([cdx]) @classmethod def _init_id(cls): return 'wb-memento' - diff --git a/pywb/warcserver/test/test_configwarcserver.py b/pywb/warcserver/test/test_configwarcserver.py index ab101f7f..17ed4810 100644 --- a/pywb/warcserver/test/test_configwarcserver.py +++ b/pywb/warcserver/test/test_configwarcserver.py @@ -2,7 +2,8 @@ from .testutils import TempDirTests, BaseTestClass from pywb.warcserver.warcserver import WarcServer import os -from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource, FileIndexSource +from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource +from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource from pywb.warcserver.handlers import ResourceHandler, HandlerSeq @@ -49,7 +50,7 @@ class TestWarcServer(TempDirTests, BaseTestClass): return handler.index_source.sources def test_list_static(self): - assert len(self.loader.list_fixed_routes()) == 12 + assert len(self.loader.list_fixed_routes()) == 13 def test_list_dynamic(self): assert self.loader.list_dynamic_routes() == ['auto1', 'auto2'] @@ -73,6 +74,14 @@ class TestWarcServer(TempDirTests, BaseTestClass): long_form_sources = self._get_sources('rhiz_long') assert sources['rhiz'] == long_form_sources['rhiz_long'] + def test_wb_memento(self): + sources = self._get_sources('rhiz_wb') + assert isinstance(sources['rhiz_wb'], WBMementoIndexSource) + assert sources['rhiz_wb'].timegate_url == 'http://webenact.rhizome.org/all/{url}' + assert sources['rhiz_wb'].timemap_url == 'http://webenact.rhizome.org/all/timemap/link/{url}' + assert sources['rhiz_wb'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}im_/{url}' + assert sources['rhiz_wb'].prefix == 'http://webenact.rhizome.org/all/' + def test_remote_cdx_2(self): sources = self._get_sources('rhiz_cdx') assert isinstance(sources['rhiz_cdx'], RemoteIndexSource) diff --git a/pywb/warcserver/test/test_warcserver_config.yaml b/pywb/warcserver/test/test_warcserver_config.yaml index 3bf09a3b..d5515907 100644 --- a/pywb/warcserver/test/test_warcserver_config.yaml +++ b/pywb/warcserver/test/test_warcserver_config.yaml @@ -9,6 +9,9 @@ collections: # rhizome (cdx) rhiz_cdx: cdx+http://webenact.rhizome.org/all-cdx + # rhizome (native wb) + rhiz_wb: wb-memento+http://webenact.rhizome.org/all/ + # ia cdx ia: cdx+http://web.archive.org/cdx /web diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index f82dccae..28386c9e 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -9,7 +9,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource -from pywb.warcserver.index.indexsource import LiveIndexSource +from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource from pywb.warcserver.index.zipnum import ZipNumIndexSource from pywb import DEFAULT_CONFIG @@ -20,6 +20,7 @@ import os SOURCE_LIST = [LiveIndexSource, + WBMementoIndexSource, RedisMultiKeyIndexSource, MementoIndexSource, CacheDirectoryIndexSource,