1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

MementoIndexSource improvements:

- use shared session for timegate/timemap queries
- catch exceptions raised by timegate queries and treat them as not found (raise NotFoundException)
- skip fuzzy match queries (ensure 'is_fuzzy' is set on params)
WBMementoIndexSource improvements:
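- fix errors in exception handling (e.g. _extract_location() raised NotFoundException(url) with 'url' undefined; 'url' is now passed in as a parameter)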
- hook up 'wb-memento' config, add tests
jsonp_rewriter: fix typo ('self.urlrewriter' -> 'self.url_rewriter')
This commit is contained in:
Ilya Kreymer 2017-06-29 18:41:30 -07:00
parent 582966bb2f
commit 9bda61cab5
6 changed files with 52 additions and 30 deletions

View File

@ -14,7 +14,7 @@ class JSONPRewriter(StreamingRewriter):
return string return string
# see if there is a callback param in current url # see if there is a callback param in current url
m_callback = self.CALLBACK.search(self.urlrewriter.wburl.url) m_callback = self.CALLBACK.search(self.url_rewriter.wburl.url)
if not m_callback: if not m_callback:
return string return string

View File

@ -107,7 +107,8 @@ class FuzzyMatcher(object):
fuzzy_params = {'url': url, fuzzy_params = {'url': url,
'matchType': matched_rule.match_type, 'matchType': matched_rule.match_type,
'filter': filters} 'filter': filters,
'is_fuzzy': True}
for key in iterkeys(params): for key in iterkeys(params):
if key not in self.FUZZY_SKIP_PARAMS: if key not in self.FUZZY_SKIP_PARAMS:

View File

@ -316,6 +316,13 @@ class MementoIndexSource(BaseIndexSource):
self.timegate_url = timegate_url self.timegate_url = timegate_url
self.timemap_url = timemap_url self.timemap_url = timemap_url
self.replay_url = replay_url self.replay_url = replay_url
self._init_sesh()
def _init_sesh(self):
self.sesh = requests.Session()
adapt = requests.adapters.HTTPAdapter(max_retries=3)
self.sesh.mount('http://', adapt)
self.sesh.mount('https://', adapt)
def links_to_cdxobject(self, link_header, def_name): def links_to_cdxobject(self, link_header, def_name):
results = MementoUtils.parse_links(link_header, def_name) results = MementoUtils.parse_links(link_header, def_name)
@ -350,8 +357,13 @@ class MementoIndexSource(BaseIndexSource):
def get_timegate_links(self, params, timestamp): def get_timegate_links(self, params, timestamp):
url = res_template(self.timegate_url, params) url = res_template(self.timegate_url, params)
accept_dt = timestamp_to_http_date(timestamp) accept_dt = timestamp_to_http_date(timestamp)
res = requests.head(url, headers={'Accept-Datetime': accept_dt}) try:
if res.status_code >= 400: headers = self._get_headers(params)
headers['Accept-Datetime'] = accept_dt
res = self.sesh.head(url, headers=headers, timeout=None)
if res.status_code >= 400:
raise NotFoundException(url)
except:
raise NotFoundException(url) raise NotFoundException(url)
links = res.headers.get('Link') links = res.headers.get('Link')
@ -361,15 +373,15 @@ class MementoIndexSource(BaseIndexSource):
return links return links
def _get_timemap_headers(self, params): def _get_headers(self, params):
return {} return {}
def handle_timemap(self, params): def handle_timemap(self, params):
url = res_template(self.timemap_url, params) url = res_template(self.timemap_url, params)
headers = self._get_timemap_headers(params) headers = self._get_headers(params)
res = requests.get(url, res = self.sesh.get(url,
headers=headers, headers=headers,
timeout=params.get('_timeout')) timeout=params.get('_timeout'))
if res.status_code >= 400 or not res.text: if res.status_code >= 400 or not res.text:
raise NotFoundException(url) raise NotFoundException(url)
@ -439,11 +451,11 @@ class MementoIndexSource(BaseIndexSource):
#============================================================================= #=============================================================================
class WBMementoIndexSource(MementoIndexSource): class WBMementoIndexSource(MementoIndexSource):
WBURL_MATCH = re.compile('([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)') WBURL_MATCH = re.compile('([0-9]{0,14})?(?:\w+_)?/{0,3}(.*)')
WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}' WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}'
def __init__(self, replay_url): def __init__(self, timegate_url, timemap_url, replay_url):
super(WBMementoIndexSource, self).__init__('', '', replay_url) super(WBMementoIndexSource, self).__init__(timegate_url, timemap_url, replay_url)
self.prefix = replay_url.split('{', 1)[0] self.prefix = replay_url.split('{', 1)[0]
def _get_referrer(self, params): def _get_referrer(self, params):
@ -456,14 +468,14 @@ class WBMementoIndexSource(MementoIndexSource):
return ref_url return ref_url
def _get_timemap_headers(self, params): def _get_headers(self, params):
ref_url = self._get_referrer(params) ref_url = self._get_referrer(params)
if ref_url: if ref_url:
return {'Referer': ref_url} return {'Referer': ref_url}
else: else:
return {} return {}
def _extract_location(self, location): def _extract_location(self, url, location):
if not location or not location.startswith(self.prefix): if not location or not location.startswith(self.prefix):
raise NotFoundException(url) raise NotFoundException(url)
@ -473,30 +485,27 @@ class WBMementoIndexSource(MementoIndexSource):
url = m.group(2) url = m.group(2)
timestamp = m.group(1) timestamp = m.group(1)
location = self._get_replay_url(timestamp, url)
return url, timestamp, location return url, timestamp, location
def handle_timegate(self, params, timestamp): def handle_timegate(self, params, timestamp):
url = params['url'] url = params['url']
load_url = self._get_replay_url(timestamp, url) load_url = self._get_replay_url(timestamp, url)
ref_url = self._get_referrer(params)
try: try:
headers = {} headers = self._get_headers(params)
if ref_url: res = self.sesh.head(load_url, headers=headers)
headers = {'Referer': ref_url}
res = requests.head(load_url, headers=headers)
except Exception as e: except Exception as e:
raise NotFoundException(url) raise NotFoundException(url)
if not res.headers.get('Memento-Datetime'): if res and res.headers.get('Memento-Datetime'):
if res.status_code >= 400: if res.status_code >= 400:
raise NotFoundException(url) raise NotFoundException(url)
if res.status_code == 302: if res.status_code >= 300:
info = self._extract_location(res.headers.get('Location')) info = self._extract_location(url, res.headers.get('Location'))
else: else:
info = self._extract_location(res.headers.get('Content-Location')) info = self._extract_location(url, res.headers.get('Content-Location'))
url, timestamp, load_url = info url, timestamp, load_url = info
@ -506,12 +515,11 @@ class WBMementoIndexSource(MementoIndexSource):
cdx['url'] = url cdx['url'] = url
cdx['load_url'] = load_url cdx['load_url'] = load_url
if ref_url: if 'Referer' in headers:
cdx['set_referrer'] = ref_url cdx['set_referrer'] = headers['Referer']
return iter([cdx]) return iter([cdx])
@classmethod @classmethod
def _init_id(cls): def _init_id(cls):
return 'wb-memento' return 'wb-memento'

View File

@ -2,7 +2,8 @@ from .testutils import TempDirTests, BaseTestClass
from pywb.warcserver.warcserver import WarcServer from pywb.warcserver.warcserver import WarcServer
import os import os
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource, FileIndexSource from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource
from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource
from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource
from pywb.warcserver.handlers import ResourceHandler, HandlerSeq from pywb.warcserver.handlers import ResourceHandler, HandlerSeq
@ -49,7 +50,7 @@ class TestWarcServer(TempDirTests, BaseTestClass):
return handler.index_source.sources return handler.index_source.sources
def test_list_static(self): def test_list_static(self):
assert len(self.loader.list_fixed_routes()) == 12 assert len(self.loader.list_fixed_routes()) == 13
def test_list_dynamic(self): def test_list_dynamic(self):
assert self.loader.list_dynamic_routes() == ['auto1', 'auto2'] assert self.loader.list_dynamic_routes() == ['auto1', 'auto2']
@ -73,6 +74,14 @@ class TestWarcServer(TempDirTests, BaseTestClass):
long_form_sources = self._get_sources('rhiz_long') long_form_sources = self._get_sources('rhiz_long')
assert sources['rhiz'] == long_form_sources['rhiz_long'] assert sources['rhiz'] == long_form_sources['rhiz_long']
def test_wb_memento(self):
sources = self._get_sources('rhiz_wb')
assert isinstance(sources['rhiz_wb'], WBMementoIndexSource)
assert sources['rhiz_wb'].timegate_url == 'http://webenact.rhizome.org/all/{url}'
assert sources['rhiz_wb'].timemap_url == 'http://webenact.rhizome.org/all/timemap/link/{url}'
assert sources['rhiz_wb'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}im_/{url}'
assert sources['rhiz_wb'].prefix == 'http://webenact.rhizome.org/all/'
def test_remote_cdx_2(self): def test_remote_cdx_2(self):
sources = self._get_sources('rhiz_cdx') sources = self._get_sources('rhiz_cdx')
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource) assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)

View File

@ -9,6 +9,9 @@ collections:
# rhizome (cdx) # rhizome (cdx)
rhiz_cdx: cdx+http://webenact.rhizome.org/all-cdx rhiz_cdx: cdx+http://webenact.rhizome.org/all-cdx
# rhizome (native wb)
rhiz_wb: wb-memento+http://webenact.rhizome.org/all/
# ia cdx # ia cdx
ia: cdx+http://web.archive.org/cdx /web ia: cdx+http://web.archive.org/cdx /web

View File

@ -9,7 +9,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
from pywb.warcserver.index.indexsource import LiveIndexSource from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
from pywb.warcserver.index.zipnum import ZipNumIndexSource from pywb.warcserver.index.zipnum import ZipNumIndexSource
from pywb import DEFAULT_CONFIG from pywb import DEFAULT_CONFIG
@ -20,6 +20,7 @@ import os
SOURCE_LIST = [LiveIndexSource, SOURCE_LIST = [LiveIndexSource,
WBMementoIndexSource,
RedisMultiKeyIndexSource, RedisMultiKeyIndexSource,
MementoIndexSource, MementoIndexSource,
CacheDirectoryIndexSource, CacheDirectoryIndexSource,