mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
mementoindexsource improvements:
- use shared session for timegate/timemap queries
- catch timegate query exceptions and treat as not found
- skip fuzzy match queries (ensure 'is_fuzzy' is set on params)

wbmementoindexsource improvements:
- fix errors related to exception handling
- hook up 'wb-memento' config, add tests

jsonp_rewriter: fix typo
This commit is contained in:
parent
582966bb2f
commit
9bda61cab5
@ -14,7 +14,7 @@ class JSONPRewriter(StreamingRewriter):
|
||||
return string
|
||||
|
||||
# see if there is a callback param in current url
|
||||
m_callback = self.CALLBACK.search(self.urlrewriter.wburl.url)
|
||||
m_callback = self.CALLBACK.search(self.url_rewriter.wburl.url)
|
||||
if not m_callback:
|
||||
return string
|
||||
|
||||
|
@ -107,7 +107,8 @@ class FuzzyMatcher(object):
|
||||
|
||||
fuzzy_params = {'url': url,
|
||||
'matchType': matched_rule.match_type,
|
||||
'filter': filters}
|
||||
'filter': filters,
|
||||
'is_fuzzy': True}
|
||||
|
||||
for key in iterkeys(params):
|
||||
if key not in self.FUZZY_SKIP_PARAMS:
|
||||
|
@ -316,6 +316,13 @@ class MementoIndexSource(BaseIndexSource):
|
||||
self.timegate_url = timegate_url
|
||||
self.timemap_url = timemap_url
|
||||
self.replay_url = replay_url
|
||||
self._init_sesh()
|
||||
|
||||
def _init_sesh(self):
|
||||
self.sesh = requests.Session()
|
||||
adapt = requests.adapters.HTTPAdapter(max_retries=3)
|
||||
self.sesh.mount('http://', adapt)
|
||||
self.sesh.mount('https://', adapt)
|
||||
|
||||
def links_to_cdxobject(self, link_header, def_name):
|
||||
results = MementoUtils.parse_links(link_header, def_name)
|
||||
@ -350,8 +357,13 @@ class MementoIndexSource(BaseIndexSource):
|
||||
def get_timegate_links(self, params, timestamp):
|
||||
url = res_template(self.timegate_url, params)
|
||||
accept_dt = timestamp_to_http_date(timestamp)
|
||||
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
||||
if res.status_code >= 400:
|
||||
try:
|
||||
headers = self._get_headers(params)
|
||||
headers['Accept-Datetime'] = accept_dt
|
||||
res = self.sesh.head(url, headers=headers, timeout=None)
|
||||
if res.status_code >= 400:
|
||||
raise NotFoundException(url)
|
||||
except:
|
||||
raise NotFoundException(url)
|
||||
|
||||
links = res.headers.get('Link')
|
||||
@ -361,15 +373,15 @@ class MementoIndexSource(BaseIndexSource):
|
||||
|
||||
return links
|
||||
|
||||
def _get_timemap_headers(self, params):
|
||||
def _get_headers(self, params):
|
||||
return {}
|
||||
|
||||
def handle_timemap(self, params):
|
||||
url = res_template(self.timemap_url, params)
|
||||
headers = self._get_timemap_headers(params)
|
||||
res = requests.get(url,
|
||||
headers=headers,
|
||||
timeout=params.get('_timeout'))
|
||||
headers = self._get_headers(params)
|
||||
res = self.sesh.get(url,
|
||||
headers=headers,
|
||||
timeout=params.get('_timeout'))
|
||||
|
||||
if res.status_code >= 400 or not res.text:
|
||||
raise NotFoundException(url)
|
||||
@ -439,11 +451,11 @@ class MementoIndexSource(BaseIndexSource):
|
||||
|
||||
#=============================================================================
|
||||
class WBMementoIndexSource(MementoIndexSource):
|
||||
WBURL_MATCH = re.compile('([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)')
|
||||
WBURL_MATCH = re.compile('([0-9]{0,14})?(?:\w+_)?/{0,3}(.*)')
|
||||
WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}'
|
||||
|
||||
def __init__(self, replay_url):
|
||||
super(WBMementoIndexSource, self).__init__('', '', replay_url)
|
||||
def __init__(self, timegate_url, timemap_url, replay_url):
|
||||
super(WBMementoIndexSource, self).__init__(timegate_url, timemap_url, replay_url)
|
||||
self.prefix = replay_url.split('{', 1)[0]
|
||||
|
||||
def _get_referrer(self, params):
|
||||
@ -456,14 +468,14 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
|
||||
return ref_url
|
||||
|
||||
def _get_timemap_headers(self, params):
|
||||
def _get_headers(self, params):
|
||||
ref_url = self._get_referrer(params)
|
||||
if ref_url:
|
||||
return {'Referer': ref_url}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def _extract_location(self, location):
|
||||
def _extract_location(self, url, location):
|
||||
if not location or not location.startswith(self.prefix):
|
||||
raise NotFoundException(url)
|
||||
|
||||
@ -473,30 +485,27 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
|
||||
url = m.group(2)
|
||||
timestamp = m.group(1)
|
||||
location = self._get_replay_url(timestamp, url)
|
||||
return url, timestamp, location
|
||||
|
||||
def handle_timegate(self, params, timestamp):
|
||||
url = params['url']
|
||||
load_url = self._get_replay_url(timestamp, url)
|
||||
ref_url = self._get_referrer(params)
|
||||
|
||||
try:
|
||||
headers = {}
|
||||
if ref_url:
|
||||
headers = {'Referer': ref_url}
|
||||
|
||||
res = requests.head(load_url, headers=headers)
|
||||
headers = self._get_headers(params)
|
||||
res = self.sesh.head(load_url, headers=headers)
|
||||
except Exception as e:
|
||||
raise NotFoundException(url)
|
||||
|
||||
if not res.headers.get('Memento-Datetime'):
|
||||
if res and res.headers.get('Memento-Datetime'):
|
||||
if res.status_code >= 400:
|
||||
raise NotFoundException(url)
|
||||
|
||||
if res.status_code == 302:
|
||||
info = self._extract_location(res.headers.get('Location'))
|
||||
if res.status_code >= 300:
|
||||
info = self._extract_location(url, res.headers.get('Location'))
|
||||
else:
|
||||
info = self._extract_location(res.headers.get('Content-Location'))
|
||||
info = self._extract_location(url, res.headers.get('Content-Location'))
|
||||
|
||||
url, timestamp, load_url = info
|
||||
|
||||
@ -506,12 +515,11 @@ class WBMementoIndexSource(MementoIndexSource):
|
||||
cdx['url'] = url
|
||||
cdx['load_url'] = load_url
|
||||
|
||||
if ref_url:
|
||||
cdx['set_referrer'] = ref_url
|
||||
if 'Referer' in headers:
|
||||
cdx['set_referrer'] = headers['Referer']
|
||||
|
||||
return iter([cdx])
|
||||
|
||||
@classmethod
|
||||
def _init_id(cls):
|
||||
return 'wb-memento'
|
||||
|
||||
|
@ -2,7 +2,8 @@ from .testutils import TempDirTests, BaseTestClass
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
import os
|
||||
|
||||
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource, FileIndexSource
|
||||
from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource
|
||||
from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource
|
||||
from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource
|
||||
from pywb.warcserver.handlers import ResourceHandler, HandlerSeq
|
||||
|
||||
@ -49,7 +50,7 @@ class TestWarcServer(TempDirTests, BaseTestClass):
|
||||
return handler.index_source.sources
|
||||
|
||||
def test_list_static(self):
|
||||
assert len(self.loader.list_fixed_routes()) == 12
|
||||
assert len(self.loader.list_fixed_routes()) == 13
|
||||
|
||||
def test_list_dynamic(self):
|
||||
assert self.loader.list_dynamic_routes() == ['auto1', 'auto2']
|
||||
@ -73,6 +74,14 @@ class TestWarcServer(TempDirTests, BaseTestClass):
|
||||
long_form_sources = self._get_sources('rhiz_long')
|
||||
assert sources['rhiz'] == long_form_sources['rhiz_long']
|
||||
|
||||
def test_wb_memento(self):
|
||||
sources = self._get_sources('rhiz_wb')
|
||||
assert isinstance(sources['rhiz_wb'], WBMementoIndexSource)
|
||||
assert sources['rhiz_wb'].timegate_url == 'http://webenact.rhizome.org/all/{url}'
|
||||
assert sources['rhiz_wb'].timemap_url == 'http://webenact.rhizome.org/all/timemap/link/{url}'
|
||||
assert sources['rhiz_wb'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}im_/{url}'
|
||||
assert sources['rhiz_wb'].prefix == 'http://webenact.rhizome.org/all/'
|
||||
|
||||
def test_remote_cdx_2(self):
|
||||
sources = self._get_sources('rhiz_cdx')
|
||||
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
|
||||
|
@ -9,6 +9,9 @@ collections:
|
||||
# rhizome (cdx)
|
||||
rhiz_cdx: cdx+http://webenact.rhizome.org/all-cdx
|
||||
|
||||
# rhizome (native wb)
|
||||
rhiz_wb: wb-memento+http://webenact.rhizome.org/all/
|
||||
|
||||
# ia cdx
|
||||
ia: cdx+http://web.archive.org/cdx /web
|
||||
|
||||
|
@ -9,7 +9,7 @@ from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq
|
||||
|
||||
from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource
|
||||
from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource
|
||||
from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource
|
||||
from pywb.warcserver.index.zipnum import ZipNumIndexSource
|
||||
|
||||
from pywb import DEFAULT_CONFIG
|
||||
@ -20,6 +20,7 @@ import os
|
||||
|
||||
|
||||
SOURCE_LIST = [LiveIndexSource,
|
||||
WBMementoIndexSource,
|
||||
RedisMultiKeyIndexSource,
|
||||
MementoIndexSource,
|
||||
CacheDirectoryIndexSource,
|
||||
|
Loading…
x
Reference in New Issue
Block a user