mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Handle ArchivalUrl subclasses.
This commit is contained in:
parent
9ff3fc300b
commit
903583c3d7
@ -59,21 +59,21 @@ class RemoteCDXServer:
|
|||||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||||
return {
|
return {
|
||||||
|
|
||||||
ArchivalUrl.QUERY:
|
wburl.QUERY:
|
||||||
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
||||||
|
|
||||||
ArchivalUrl.URL_QUERY:
|
wburl.URL_QUERY:
|
||||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||||
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
||||||
},
|
},
|
||||||
|
|
||||||
ArchivalUrl.REPLAY:
|
wburl.REPLAY:
|
||||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||||
|
|
||||||
# BUG: resolveRevisits currently doesn't work for this type of query
|
# BUG: resolveRevisits currently doesn't work for this type of query
|
||||||
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
||||||
# but may be an issue in proxy mode
|
# but may be an issue in proxy mode
|
||||||
ArchivalUrl.LATEST_REPLAY:
|
wburl.LATEST_REPLAY:
|
||||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||||
|
|
||||||
}[wburl.type]
|
}[wburl.type]
|
||||||
|
@ -27,7 +27,7 @@ class WBHandler:
|
|||||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||||
query_response = self.query(wbrequest)
|
query_response = self.query(wbrequest)
|
||||||
|
|
||||||
if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY):
|
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||||
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
|
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
|
||||||
|
|
||||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||||
|
@ -58,6 +58,7 @@ class ArchivalUrlRewriter:
|
|||||||
def __init__(self, wburl, prefix):
|
def __init__(self, wburl, prefix):
|
||||||
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
self.archivalurl_class = self.wburl.__class__
|
||||||
|
|
||||||
if self.prefix.endswith('/'):
|
if self.prefix.endswith('/'):
|
||||||
self.prefix = self.prefix[:-1]
|
self.prefix = self.prefix[:-1]
|
||||||
@ -86,18 +87,18 @@ class ArchivalUrlRewriter:
|
|||||||
if mod is None:
|
if mod is None:
|
||||||
mod = wburl.mod
|
mod = wburl.mod
|
||||||
|
|
||||||
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
|
||||||
|
|
||||||
return finalUrl
|
return finalUrl
|
||||||
|
|
||||||
def getAbsUrl(self, url = ''):
|
def getAbsUrl(self, url = ''):
|
||||||
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
|
return self.prefix + self.wburl.to_str(url=url)
|
||||||
|
|
||||||
def getTimestampUrl(self, timestamp, url = None):
|
def getTimestampUrl(self, timestamp, url = None):
|
||||||
if not url:
|
if url is None:
|
||||||
url = self.wburl.url
|
url = self.wburl.url
|
||||||
|
|
||||||
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url)
|
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
|
||||||
|
|
||||||
|
|
||||||
def setBaseUrl(self, newUrl):
|
def setBaseUrl(self, newUrl):
|
||||||
|
@ -81,7 +81,7 @@ class ArchivalUrl:
|
|||||||
self.timestamp = ''
|
self.timestamp = ''
|
||||||
self.mod = ''
|
self.mod = ''
|
||||||
|
|
||||||
if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]):
|
if not any (f(url) for f in [self._init_query, self._init_replay]):
|
||||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
if len(self.url) == 0:
|
if len(self.url) == 0:
|
||||||
@ -89,10 +89,10 @@ class ArchivalUrl:
|
|||||||
|
|
||||||
# protocol agnostic url -> http://
|
# protocol agnostic url -> http://
|
||||||
if self.url.startswith('//'):
|
if self.url.startswith('//'):
|
||||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
|
self.url = self.DEFAULT_SCHEME + self.url[2:]
|
||||||
# no protocol -> http://
|
# no protocol -> http://
|
||||||
elif not '://' in self.url:
|
elif not '://' in self.url:
|
||||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
|
self.url = self.DEFAULT_SCHEME + self.url
|
||||||
|
|
||||||
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
|
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
|
||||||
# %2F is fine, but %2f -- standard supports either
|
# %2F is fine, but %2f -- standard supports either
|
||||||
@ -104,7 +104,7 @@ class ArchivalUrl:
|
|||||||
# Match query regex
|
# Match query regex
|
||||||
# ======================
|
# ======================
|
||||||
def _init_query(self, url):
|
def _init_query(self, url):
|
||||||
query = ArchivalUrl.QUERY_REGEX.match(url)
|
query = self.QUERY_REGEX.match(url)
|
||||||
if not query:
|
if not query:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -114,16 +114,16 @@ class ArchivalUrl:
|
|||||||
self.timestamp = res[1]
|
self.timestamp = res[1]
|
||||||
self.url = res[2]
|
self.url = res[2]
|
||||||
if self.url.endswith('*'):
|
if self.url.endswith('*'):
|
||||||
self.type = ArchivalUrl.URL_QUERY
|
self.type = self.URL_QUERY
|
||||||
self.url = self.url[:-1]
|
self.url = self.url[:-1]
|
||||||
else:
|
else:
|
||||||
self.type = ArchivalUrl.QUERY
|
self.type = self.QUERY
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Match replay regex
|
# Match replay regex
|
||||||
# ======================
|
# ======================
|
||||||
def _init_replay(self, url):
|
def _init_replay(self, url):
|
||||||
replay = ArchivalUrl.REPLAY_REGEX.match(url)
|
replay = self.REPLAY_REGEX.match(url)
|
||||||
if not replay:
|
if not replay:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -133,17 +133,21 @@ class ArchivalUrl:
|
|||||||
self.mod = res[1]
|
self.mod = res[1]
|
||||||
self.url = res[2]
|
self.url = res[2]
|
||||||
if self.timestamp:
|
if self.timestamp:
|
||||||
self.type = ArchivalUrl.REPLAY
|
self.type = self.REPLAY
|
||||||
else:
|
else:
|
||||||
self.type = ArchivalUrl.LATEST_REPLAY
|
self.type = self.LATEST_REPLAY
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Str Representation
|
# Str Representation
|
||||||
# ====================
|
# ====================
|
||||||
@staticmethod
|
def to_str(self, **overrides):
|
||||||
def to_str(atype, mod, timestamp, url):
|
atype = overrides['type'] if 'type' in overrides else self.type
|
||||||
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
|
mod = overrides['mod'] if 'mod' in overrides else self.mod
|
||||||
|
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
|
||||||
|
url = overrides['url'] if 'url' in overrides else self.url
|
||||||
|
|
||||||
|
if atype == self.QUERY or atype == self.URL_QUERY:
|
||||||
tsmod = "/"
|
tsmod = "/"
|
||||||
if mod:
|
if mod:
|
||||||
tsmod += mod + "/"
|
tsmod += mod + "/"
|
||||||
@ -151,7 +155,7 @@ class ArchivalUrl:
|
|||||||
tsmod += timestamp
|
tsmod += timestamp
|
||||||
|
|
||||||
tsmod += "*/" + url
|
tsmod += "*/" + url
|
||||||
if atype == ArchivalUrl.URL_QUERY:
|
if atype == self.URL_QUERY:
|
||||||
tsmod += "*"
|
tsmod += "*"
|
||||||
return tsmod
|
return tsmod
|
||||||
else:
|
else:
|
||||||
@ -162,7 +166,7 @@ class ArchivalUrl:
|
|||||||
return "/" + url
|
return "/" + url
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.to_str(self.type, self.mod, self.timestamp, self.url)
|
return self.to_str()
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user