mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
index source: add 'wayback' memento index source, which relies on a direct wayback-style timestamp redirect instead of a memento timegate redirect. Used if memento support/Memento-Datetime is not available (no support for calendar…)
fuzzy matcher and index source: memento index sources ignore any fuzzy match queries (not supported via memento)
This commit is contained in:
parent
eac5d18985
commit
84ed1b5519
@ -296,6 +296,7 @@ class RewriterApp(object):
|
|||||||
response = WbResponse(status_headers, gen)
|
response = WbResponse(status_headers, gen)
|
||||||
|
|
||||||
if is_proxy:
|
if is_proxy:
|
||||||
|
response.status_headers.remove_header('Content-Security-Policy-Report-Only')
|
||||||
response.status_headers.remove_header('Content-Security-Policy')
|
response.status_headers.remove_header('Content-Security-Policy')
|
||||||
response.status_headers.remove_header('X-Frame-Options')
|
response.status_headers.remove_header('X-Frame-Options')
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pywb.utils.binsearch import iter_range
|
from pywb.utils.binsearch import iter_range
|
||||||
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||||
from warcio.timeutils import timestamp_now
|
from warcio.timeutils import timestamp_now, pad_timestamp, PAD_14_DOWN
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
|
||||||
@ -379,6 +379,10 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
timestamp = params.get('closest')
|
timestamp = params.get('closest')
|
||||||
|
|
||||||
|
# can't do fuzzy matching via memento
|
||||||
|
if params.get('is_fuzzy'):
|
||||||
|
raise NotFoundException(params['url'] + '*')
|
||||||
|
|
||||||
if not timestamp:
|
if not timestamp:
|
||||||
return self.handle_timemap(params)
|
return self.handle_timemap(params)
|
||||||
else:
|
else:
|
||||||
@ -431,3 +435,82 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
config['timemap_url'],
|
config['timemap_url'],
|
||||||
config['replay_url'])
|
config['replay_url'])
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class WBMementoIndexSource(MementoIndexSource):
    """Memento index source that resolves captures via wayback-style redirects.

    Rather than querying a memento timegate, ``handle_timegate`` sends a HEAD
    request for the replay url at the requested timestamp. When the response
    carries no ``Memento-Datetime`` header, the capture url and timestamp are
    taken from the ``Location`` (302) or ``Content-Location`` header.
    """

    # Applied to the part of a location after ``self.prefix``:
    # group(1) is up to 14 digits plus an optional ``<word>_`` modifier,
    # group(2) is the remainder (the original url) after up to three slashes.
    WBURL_MATCH = re.compile(r'([0-9]{0,14}(?:\w+_)?)?/{0,3}(.*)')

    # NOTE(review): not referenced inside this class; presumably consumed
    # elsewhere (base class or configuration) -- confirm.
    WAYBACK_ORIG_SUFFIX = '{timestamp}im_/{url}'

    def __init__(self, replay_url):
        """Initialize from a replay url template.

        :param replay_url: replay url template; everything before its first
            ``{`` is stored as ``self.prefix`` and used to validate redirect
            locations.
        """
        # The first two base-class arguments are passed as empty strings.
        super(WBMementoIndexSource, self).__init__('', '', replay_url)
        self.prefix = replay_url.split('{', 1)[0]

    def _get_referrer(self, params):
        """Return the base-class referrer rewritten as a replay url.

        If the base class yields a referrer, it is turned into a replay url
        at the ``closest`` timestamp (default ``'20'``, padded with
        ``PAD_14_DOWN``) and any ``im_/`` in the result is replaced by ``/``.
        A falsy base-class referrer is returned unchanged.
        """
        ref_url = super(WBMementoIndexSource, self)._get_referrer(params)
        if ref_url:
            timestamp = params.get('closest', '20')
            timestamp = pad_timestamp(timestamp, PAD_14_DOWN)
            ref_url = self._get_replay_url(timestamp, ref_url)
            ref_url = ref_url.replace('im_/', '/')

        return ref_url

    def _get_timemap_headers(self, params):
        """Return ``{'Referer': ...}`` if a referrer is available, else ``{}``."""
        ref_url = self._get_referrer(params)
        if ref_url:
            return {'Referer': ref_url}

        return {}

    def _extract_location(self, location, url=None):
        """Split a redirect location into ``(url, timestamp, location)``.

        :param location: value of a ``Location`` / ``Content-Location`` header
        :param url: optional url being looked up, used only for the error
            raised when ``location`` is unusable
        :raises NotFoundException: if ``location`` is empty, does not start
            with ``self.prefix``, or does not match ``WBURL_MATCH``
        """
        # The original raised NotFoundException(url) here before ``url`` was
        # ever assigned, which surfaced as UnboundLocalError instead.
        not_found_url = url or location

        if not location or not location.startswith(self.prefix):
            raise NotFoundException(not_found_url)

        m = self.WBURL_MATCH.search(location[len(self.prefix):])
        if not m:
            raise NotFoundException(not_found_url)

        return m.group(2), m.group(1), location

    def handle_timegate(self, params, timestamp):
        """Resolve ``params['url']`` at ``timestamp`` via a HEAD request.

        :param params: query params; ``params['url']`` is required
        :param timestamp: timestamp used to build the replay url to request
        :returns: an iterator over a single ``CDXObject`` with ``urlkey``,
            ``timestamp``, ``url``, ``load_url`` and, when a referrer is
            available, ``set_referrer``
        :raises NotFoundException: if the request fails, or if the response
            has no ``Memento-Datetime`` header and either has a status
            >= 400 or carries no usable location header
        """
        url = params['url']
        load_url = self._get_replay_url(timestamp, url)
        ref_url = self._get_referrer(params)

        headers = {'Referer': ref_url} if ref_url else {}

        try:
            res = requests.head(load_url, headers=headers)
        except Exception:
            # Any failure to reach the remote archive is reported as not found.
            raise NotFoundException(url)

        if not res.headers.get('Memento-Datetime'):
            if res.status_code >= 400:
                raise NotFoundException(url)

            # No memento header: take the resolved capture from the redirect
            # target (302) or from Content-Location otherwise.
            if res.status_code == 302:
                location = res.headers.get('Location')
            else:
                location = res.headers.get('Content-Location')

            url, timestamp, load_url = self._extract_location(location, url)

        cdx = CDXObject()
        cdx['urlkey'] = canonicalize(url)
        cdx['timestamp'] = timestamp
        cdx['url'] = url
        cdx['load_url'] = load_url

        if ref_url:
            cdx['set_referrer'] = ref_url

        return iter([cdx])

    @classmethod
    def _init_id(cls):
        """Return the identifier string for this index source type."""
        return 'wb-memento'
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user