1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

index source: add 'wayback' memento index source, which relies on a direct wayback-style timestamp redirect instead of a memento timegate redirect. Used if memento support / the Memento-Datetime header is not available (no support for calendar…)

fuzzy matcher and index source: memento index sources ignore any fuzzy match queries (not supported via memento)
This commit is contained in:
Ilya Kreymer 2017-06-05 14:17:54 -07:00
parent eac5d18985
commit 84ed1b5519
2 changed files with 85 additions and 1 deletions

View File

@ -296,6 +296,7 @@ class RewriterApp(object):
response = WbResponse(status_headers, gen)
if is_proxy:
response.status_headers.remove_header('Content-Security-Policy-Report-Only')
response.status_headers.remove_header('Content-Security-Policy')
response.status_headers.remove_header('X-Frame-Options')

View File

@ -1,6 +1,6 @@
from pywb.utils.binsearch import iter_range
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from warcio.timeutils import timestamp_now
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
@ -379,6 +379,10 @@ class MementoIndexSource(BaseIndexSource):
def load_index(self, params):
timestamp = params.get('closest')
# can't do fuzzy matching via memento
if params.get('is_fuzzy'):
raise NotFoundException(params['url'] + '*')
if not timestamp:
return self.handle_timemap(params)
else:
@ -431,3 +435,82 @@ class MementoIndexSource(BaseIndexSource):
config['timemap_url'],
config['replay_url'])
#=============================================================================
class WBMementoIndexSource(MementoIndexSource):
    """Memento-style index source backed by a plain wayback replay endpoint.

    Rather than querying a Memento timegate, ``handle_timegate`` sends a HEAD
    request to the wayback-style replay url. When the response carries no
    ``Memento-Datetime`` header, the resolved capture is read from the
    ``Location`` (302) or ``Content-Location`` (any other status) header.
    """

    # Applied to the part of a replay url that follows ``self.prefix``:
    #   group(1) -- up to 14 digits followed by an optional ``<word>_`` modifier
    #   group(2) -- everything after up to three slashes
    # Raw string so ``\w`` is not treated as a (invalid) string escape.
    WBURL_MATCH = re.compile(r'([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)')

    # NOTE(review): not referenced inside this class; presumably consumed by
    # the base class when building replay urls -- confirm.
    WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}'

    def __init__(self, replay_url):
        """Create the source from a replay url template.

        :param replay_url: url template; everything before its first ``{``
            is kept as ``self.prefix`` and used to validate redirect targets.
        """
        # The first two base-class arguments are passed as empty strings.
        super(WBMementoIndexSource, self).__init__('', '', replay_url)
        self.prefix = replay_url.split('{', 1)[0]

    def _get_referrer(self, params):
        """Return the referrer from the base class, rewritten as a replay url.

        A falsy base-class referrer is returned unchanged.
        """
        ref_url = super(WBMementoIndexSource, self)._get_referrer(params)
        if ref_url:
            # '20' is the fallback when no 'closest' timestamp was requested;
            # it is then padded out with PAD_14_DOWN.
            timestamp = params.get('closest', '20')
            timestamp = pad_timestamp(timestamp, PAD_14_DOWN)
            ref_url = self._get_replay_url(timestamp, ref_url)
            # Strip the ``im_`` modifier from the generated url
            # (presumably so the referrer looks like a page replay -- confirm).
            ref_url = ref_url.replace('im_/', '/')

        return ref_url

    def _get_timemap_headers(self, params):
        """Return request headers: a ``Referer`` entry if one is available."""
        ref_url = self._get_referrer(params)
        if ref_url:
            return {'Referer': ref_url}
        else:
            return {}

    def _extract_location(self, location, url=None):
        """Split a redirect target into ``(url, timestamp, location)``.

        :param location: value of a ``Location`` / ``Content-Location`` header.
        :param url: optional url being looked up, used only for the error
            reported when ``location`` is unusable. Falls back to
            ``location`` itself (or an empty string) when omitted.
        :raises NotFoundException: if ``location`` is empty, does not start
            with ``self.prefix``, or does not match ``WBURL_MATCH``.
        """
        # Previously the raises below referenced ``url`` before it was
        # assigned, producing UnboundLocalError instead of NotFoundException.
        not_found_url = url or location or ''

        if not location or not location.startswith(self.prefix):
            raise NotFoundException(not_found_url)

        m = self.WBURL_MATCH.search(location[len(self.prefix):])
        if not m:
            raise NotFoundException(not_found_url)

        url = m.group(2)
        timestamp = m.group(1)
        return url, timestamp, location

    def handle_timegate(self, params, timestamp):
        """Resolve ``params['url']`` at ``timestamp`` via a HEAD request.

        :param params: query params; ``params['url']`` is required.
        :param timestamp: timestamp used to build the replay url to probe.
        :returns: iterator over a single ``CDXObject`` with ``urlkey``,
            ``timestamp``, ``url``, ``load_url`` and, when a referrer is
            available, ``set_referrer``.
        :raises NotFoundException: if the request raises, if the response has
            no ``Memento-Datetime`` header and a status >= 400, or if the
            redirect target cannot be parsed.
        """
        url = params['url']
        load_url = self._get_replay_url(timestamp, url)

        ref_url = self._get_referrer(params)

        try:
            headers = {}
            if ref_url:
                headers = {'Referer': ref_url}
            res = requests.head(load_url, headers=headers)
        except Exception:
            # Any failure to reach the remote archive is reported as not found.
            raise NotFoundException(url)

        if not res.headers.get('Memento-Datetime'):
            if res.status_code >= 400:
                raise NotFoundException(url)

            # No memento header: take the resolved capture from the redirect
            # target (302) or from Content-Location (other statuses).
            if res.status_code == 302:
                info = self._extract_location(res.headers.get('Location'), url)
            else:
                info = self._extract_location(res.headers.get('Content-Location'), url)

            url, timestamp, load_url = info

        cdx = CDXObject()
        cdx['urlkey'] = canonicalize(url)
        cdx['timestamp'] = timestamp
        cdx['url'] = url
        cdx['load_url'] = load_url

        if ref_url:
            cdx['set_referrer'] = ref_url

        return iter([cdx])

    @classmethod
    def _init_id(cls):
        """Return the identifier string for this index source type."""
        return 'wb-memento'