1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

MementoIndexSource improvements:

- use shared session for timegate/timemap queries
- catch timegate query exceptions and treat as not found
- skip fuzzy match queries (ensure 'is_fuzzy' is set on params)
WBMementoIndexSource improvements:
- fix errors related to exception handling
- hook up 'wb-memento' config, add tests
jsonp_rewriter: fix typo in attribute name (`urlrewriter` -> `url_rewriter`)
This commit is contained in:
Ilya Kreymer 2017-06-29 18:41:30 -07:00
parent 582966bb2f
commit 9bda61cab5
6 changed files with 52 additions and 30 deletions

View File

@ -14,7 +14,7 @@ class JSONPRewriter(StreamingRewriter):
return string
# see if there is a callback param in current url
m_callback = self.CALLBACK.search(self.urlrewriter.wburl.url)
m_callback = self.CALLBACK.search(self.url_rewriter.wburl.url)
if not m_callback:
return string

View File

@ -107,7 +107,8 @@ class FuzzyMatcher(object):
fuzzy_params = {'url': url,
'matchType': matched_rule.match_type,
'filter': filters}
'filter': filters,
'is_fuzzy': True}
for key in iterkeys(params):
if key not in self.FUZZY_SKIP_PARAMS:

View File

@ -316,6 +316,13 @@ class MementoIndexSource(BaseIndexSource):
self.timegate_url = timegate_url
self.timemap_url = timemap_url
self.replay_url = replay_url
self._init_sesh()
def _init_sesh(self):
self.sesh = requests.Session()
adapt = requests.adapters.HTTPAdapter(max_retries=3)
self.sesh.mount('http://', adapt)
self.sesh.mount('https://', adapt)
def links_to_cdxobject(self, link_header, def_name):
results = MementoUtils.parse_links(link_header, def_name)
@ -350,8 +357,13 @@ class MementoIndexSource(BaseIndexSource):
def get_timegate_links(self, params, timestamp):
url = res_template(self.timegate_url, params)
accept_dt = timestamp_to_http_date(timestamp)
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
if res.status_code >= 400:
try:
headers = self._get_headers(params)
headers['Accept-Datetime'] = accept_dt
res = self.sesh.head(url, headers=headers, timeout=None)
if res.status_code >= 400:
raise NotFoundException(url)
except:
raise NotFoundException(url)
links = res.headers.get('Link')
@ -361,15 +373,15 @@ class MementoIndexSource(BaseIndexSource):
return links
def _get_timemap_headers(self, params):
def _get_headers(self, params):
return {}
def handle_timemap(self, params):
url = res_template(self.timemap_url, params)
headers = self._get_timemap_headers(params)
res = requests.get(url,
headers=headers,
timeout=params.get('_timeout'))
headers = self._get_headers(params)
res = self.sesh.get(url,
headers=headers,
timeout=params.get('_timeout'))
if res.status_code >= 400 or not res.text:
raise NotFoundException(url)
@ -439,11 +451,11 @@ class MementoIndexSource(BaseIndexSource):
#=============================================================================
class WBMementoIndexSource(MementoIndexSource):
WBURL_MATCH = re.compile('([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)')
WBURL_MATCH = re.compile('([0-9]{0,14})?(?:\w+_)?/{0,3}(.*)')
WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}'
def __init__(self, replay_url):
super(WBMementoIndexSource, self).__init__('', '', replay_url)
def __init__(self, timegate_url, timemap_url, replay_url):
super(WBMementoIndexSource, self).__init__(timegate_url, timemap_url, replay_url)
self.prefix = replay_url.split('{', 1)[0]
def _get_referrer(self, params):
@ -456,14 +468,14 @@ class WBMementoIndexSource(MementoIndexSource):
return ref_url
def _get_timemap_headers(self, params):
def _get_headers(self, params):
ref_url = self._get_referrer(params)
if ref_url:
return {'Referer': ref_url}
else:
return {}
def _extract_location(self, location):
def _extract_location(self, url, location):
if not location or not location.startswith(self.prefix):
raise NotFoundException(url)
@ -473,30 +485,27 @@ class WBMementoIndexSource(MementoIndexSource):
url = m.group(2)
timestamp = m.group(1)
location = self._get_replay_url(timestamp, url)
return url, timestamp, location
def handle_timegate(self, params, timestamp):
url = params['url']
load_url = self._get_replay_url(timestamp, url)
ref_url = self._get_referrer(params)
try:
headers = {}
if ref_url:
headers = {'Referer': ref_url}
res = requests.head(load_url, headers=headers)
headers = self._get_headers(params)
res = self.sesh.head(load_url, headers=headers)
except Exception as e:
raise NotFoundException(url)
if not res.headers.get('Memento-Datetime'):
if res and res.headers.get('Memento-Datetime'):
if res.status_code >= 400:
raise NotFoundException(url)
if res.status_code == 302:
info = self._extract_location(res.headers.get('Location'))
if res.status_code >= 300:
info = self._extract_location(url, res.headers.get('Location'))
else:
info = self._extract_location(res.headers.get('Content-Location'))
info = self._extract_location(url, res.headers.get('Content-Location'))
url, timestamp, load_url = info
@ -506,12 +515,11 @@ class WBMementoIndexSource(MementoIndexSource):
cdx['url'] = url
cdx['load_url'] = load_url
if ref_url:
cdx['set_referrer'] = ref_url
if 'Referer' in headers:
cdx['set_referrer'] = headers['Referer']
return iter([cdx])
@classmethod
def _init_id(cls):
return 'wb-memento'

View File

@ -2,7 +2,8 @@ from .testutils import TempDirTests, BaseTestClass
from pywb.warcserver.warcserver import WarcServer
import os
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource, FileIndexSource
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource
from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource
from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource
from pywb.warcserver.handlers import ResourceHandler, HandlerSeq
@ -49,7 +50,7 @@ class TestWarcServer(TempDirTests, BaseTestClass):
return handler.index_source.sources
def test_list_static(self):
assert len(self.loader.list_fixed_routes()) == 12
assert len(self.loader.list_fixed_routes()) == 13
def test_list_dynamic(self):
assert self.loader.list_dynamic_routes() == ['auto1', 'auto2']
@ -73,6 +74,14 @@ class TestWarcServer(TempDirTests, BaseTestClass):
long_form_sources = self._get_sources('rhiz_long')
assert sources['rhiz'] == long_form_sources['rhiz_long']
def test_wb_memento(self):
sources = self._get_sources('rhiz_wb')
assert isinstance(sources['rhiz_wb'], WBMementoIndexSource)
assert sources['rhiz_wb'].timegate_url == 'http://webenact.rhizome.org/all/{url}'
assert sources['rhiz_wb'].timemap_url == 'http://webenact.rhizome.org/all/timemap/link/{url}'
assert sources['rhiz_wb'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}im_/{url}'
assert sources['rhiz_wb'].prefix == 'http://webenact.rhizome.org/all/'
def test_remote_cdx_2(self):
sources = self._get_sources('rhiz_cdx')
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)

View File

@ -9,6 +9,9 @@ collections:
# rhizome (cdx)
rhiz_cdx: cdx+http://webenact.rhizome.org/all-cdx
# rhizome (native wb)
rhiz_wb: wb-memento+http://webenact.rhizome.org/all/
# ia cdx
ia: cdx+http://web.archive.org/cdx /web

View File

@ -9,7 +9,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
from pywb.warcserver.index.indexsource import LiveIndexSource
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
from pywb.warcserver.index.zipnum import ZipNumIndexSource
from pywb import DEFAULT_CONFIG
@ -20,6 +20,7 @@ import os
SOURCE_LIST = [LiveIndexSource,
WBMementoIndexSource,
RedisMultiKeyIndexSource,
MementoIndexSource,
CacheDirectoryIndexSource,