1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge 853eedc246385ed8a83b294750319a6a666e50c7 into 7b0f8b58607fb0ed338f0cfddeb80c629582d8f6

This commit is contained in:
Kai Jauslin 2024-12-02 04:56:05 +00:00 committed by GitHub
commit eaf065727e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,4 @@
from six.moves.urllib.parse import quote_plus from six.moves.urllib.parse import quote_plus, quote, parse_qs, urlparse
from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date
from pywb.utils.binsearch import iter_range from pywb.utils.binsearch import iter_range
@ -118,6 +118,8 @@ class FileIndexSource(BaseIndexSource):
#============================================================================= #=============================================================================
class RemoteIndexSource(BaseIndexSource): class RemoteIndexSource(BaseIndexSource):
# Matches an index spec of the form 'cdx+<url>' or 'cdxj+<url>' and captures
# the http(s) url in the 'url' group.
# Raw string: '\+' and '\:' are not valid str escapes, so a plain literal
# triggers an invalid-escape warning on newer Python versions.
CDX_MATCH_RX = re.compile(r'^cdxj?\+(?P<url>https?\:.*)')

# Matches a url key that carries request body data as a query argument.
# 'post_key' is the argument name (one of the three listed names) and
# 'post_data' is its value, up to the next '&' or the end of the key.
POSTDATA_MATCH_RX = re.compile(
    r'.*?[?&](?P<post_key>__wb_post_data|__warc_post_data|__wb_json_data)'
    r'=(?P<post_data>[^&]+).*$')
def __init__(self, api_url, replay_url, url_field='load_url', closest_limit=100): def __init__(self, api_url, replay_url, url_field='load_url', closest_limit=100):
self.api_url = api_url self.api_url = api_url
@ -127,15 +129,33 @@ class RemoteIndexSource(BaseIndexSource):
self._init_sesh() self._init_sesh()
def _get_api_url(self, params):
    """Build the remote index query url for ``params``.

    Steps, in order:
      1. ``add_url_post_param`` stores ``params['url_post']`` (mutates
         ``params``) before the template is expanded.
      2. ``res_template(self.api_url, params)`` produces the base url.
         ``res_template`` is defined outside this view -- presumably it
         substitutes ``params`` values into the template; confirm there.
      3. ``&limit=<closest_limit>`` is appended when a ``closest`` param
         is present and ``self.closest_limit`` is truthy.
      4. ``&matchType=<value>`` is appended when ``matchType`` is set.

    :param params: dict of query parameters; must contain ``key`` and
        ``url`` (read by ``add_url_post_param``).
    :return: the composed api url as a string.
    """
    self.add_url_post_param(params)
    api_url = res_template(self.api_url, params)

    if 'closest' in params and self.closest_limit:
        api_url += '&limit=' + str(self.closest_limit)

    # Stringify and escape the value: a None value used to raise TypeError
    # on concatenation, and an unescaped '&' or '#' in the value would
    # alter the rest of the query string.
    match_type = params.get('matchType')
    if match_type is not None:
        api_url += '&matchType=' + quote_plus(str(match_type))

    # NOTE(review): self.logger is not assigned anywhere in the visible
    # part of the class -- confirm it is set up (e.g. in _init_sesh or a base).
    self.logger.info(api_url)
    return api_url
def add_url_post_param(self, params):
    """Store a percent-encoded lookup url in ``params['url_post']``.

    The value always starts with ``quote(params['url'])``. When
    ``params['key']`` matches ``self.POSTDATA_MATCH_RX`` (i.e. it carries a
    ``__wb_post_data`` / ``__warc_post_data`` / ``__wb_json_data``
    argument), that ``name=value`` pair is appended as an extra query
    argument, percent-encoded as well.

    :param params: dict with ``url`` (text) and ``key`` (utf-8 bytes, or
        text); mutated in place.
    :raises KeyError: if ``key`` or ``url`` is missing from ``params``.
    """
    key = params['key']
    # The original only handled bytes; accept an already-decoded key too.
    key_str = key.decode('utf-8') if isinstance(key, bytes) else key

    url = params['url']
    params['url_post'] = quote(url)

    match_post = self.POSTDATA_MATCH_RX.match(key_str)
    if not match_post:
        return

    # Decide the separator from the raw query string. parse_qs() drops
    # blank-valued fields, so a url such as '...?a=' used to look like it
    # had no query and received a second '?'.
    separator = '&' if urlparse(url).query else '?'
    post_arg = '%s%s=%s' % (separator,
                            match_post.group('post_key'),
                            match_post.group('post_data'))
    params['url_post'] += quote(post_arg)
def load_index(self, params): def load_index(self, params):
api_url = self._get_api_url(params) api_url = self._get_api_url(params)
try: try: