1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Handle ArchivalUrl subclasses.

This commit is contained in:
Jack Cushman 2014-01-20 14:12:59 -05:00
parent 9ff3fc300b
commit 903583c3d7
4 changed files with 28 additions and 23 deletions

View File

@ -59,21 +59,21 @@ class RemoteCDXServer:
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'): def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
return { return {
ArchivalUrl.QUERY: wburl.QUERY:
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
ArchivalUrl.URL_QUERY: wburl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
}, },
ArchivalUrl.REPLAY: wburl.REPLAY:
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True}, {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
# BUG: resolveRevisits currently doesn't work for this type of query # BUG: resolveRevisits currently doesn't work for this type of query
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query # This is not an issue in archival mode, as there is a redirect to the actual timestamp query
# but may be an issue in proxy mode # but may be an issue in proxy mode
ArchivalUrl.LATEST_REPLAY: wburl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
}[wburl.type] }[wburl.type]

View File

@ -27,7 +27,7 @@ class WBHandler:
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
query_response = self.query(wbrequest) query_response = self.query(wbrequest)
if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY): if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:

View File

@ -58,6 +58,7 @@ class ArchivalUrlRewriter:
def __init__(self, wburl, prefix): def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl) self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
self.prefix = prefix self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
if self.prefix.endswith('/'): if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1] self.prefix = self.prefix[:-1]
@ -86,18 +87,18 @@ class ArchivalUrlRewriter:
if mod is None: if mod is None:
mod = wburl.mod mod = wburl.mod
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl) finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
return finalUrl return finalUrl
def getAbsUrl(self, url = ''): def getAbsUrl(self, url = ''):
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url) return self.prefix + self.wburl.to_str(url=url)
def getTimestampUrl(self, timestamp, url = None): def getTimestampUrl(self, timestamp, url = None):
if not url: if url is None:
url = self.wburl.url url = self.wburl.url
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url) return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def setBaseUrl(self, newUrl): def setBaseUrl(self, newUrl):

View File

@ -81,7 +81,7 @@ class ArchivalUrl:
self.timestamp = '' self.timestamp = ''
self.mod = '' self.mod = ''
if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]): if not any (f(url) for f in [self._init_query, self._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if len(self.url) == 0: if len(self.url) == 0:
@ -89,10 +89,10 @@ class ArchivalUrl:
# protocol agnostic url -> http:// # protocol agnostic url -> http://
if self.url.startswith('//'): if self.url.startswith('//'):
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:] self.url = self.DEFAULT_SCHEME + self.url[2:]
# no protocol -> http:// # no protocol -> http://
elif not '://' in self.url: elif not '://' in self.url:
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url self.url = self.DEFAULT_SCHEME + self.url
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
# %2F is fine, but %2f -- standard supports either # %2F is fine, but %2f -- standard supports either
@ -104,7 +104,7 @@ class ArchivalUrl:
# Match query regex # Match query regex
# ====================== # ======================
def _init_query(self, url): def _init_query(self, url):
query = ArchivalUrl.QUERY_REGEX.match(url) query = self.QUERY_REGEX.match(url)
if not query: if not query:
return None return None
@ -114,16 +114,16 @@ class ArchivalUrl:
self.timestamp = res[1] self.timestamp = res[1]
self.url = res[2] self.url = res[2]
if self.url.endswith('*'): if self.url.endswith('*'):
self.type = ArchivalUrl.URL_QUERY self.type = self.URL_QUERY
self.url = self.url[:-1] self.url = self.url[:-1]
else: else:
self.type = ArchivalUrl.QUERY self.type = self.QUERY
return True return True
# Match replay regex # Match replay regex
# ====================== # ======================
def _init_replay(self, url): def _init_replay(self, url):
replay = ArchivalUrl.REPLAY_REGEX.match(url) replay = self.REPLAY_REGEX.match(url)
if not replay: if not replay:
return None return None
@ -133,17 +133,21 @@ class ArchivalUrl:
self.mod = res[1] self.mod = res[1]
self.url = res[2] self.url = res[2]
if self.timestamp: if self.timestamp:
self.type = ArchivalUrl.REPLAY self.type = self.REPLAY
else: else:
self.type = ArchivalUrl.LATEST_REPLAY self.type = self.LATEST_REPLAY
return True return True
# Str Representation # Str Representation
# ==================== # ====================
@staticmethod def to_str(self, **overrides):
def to_str(atype, mod, timestamp, url): atype = overrides['type'] if 'type' in overrides else self.type
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY: mod = overrides['mod'] if 'mod' in overrides else self.mod
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
url = overrides['url'] if 'url' in overrides else self.url
if atype == self.QUERY or atype == self.URL_QUERY:
tsmod = "/" tsmod = "/"
if mod: if mod:
tsmod += mod + "/" tsmod += mod + "/"
@ -151,7 +155,7 @@ class ArchivalUrl:
tsmod += timestamp tsmod += timestamp
tsmod += "*/" + url tsmod += "*/" + url
if atype == ArchivalUrl.URL_QUERY: if atype == self.URL_QUERY:
tsmod += "*" tsmod += "*"
return tsmod return tsmod
else: else:
@ -162,7 +166,7 @@ class ArchivalUrl:
return "/" + url return "/" + url
def __str__(self): def __str__(self):
return self.to_str(self.type, self.mod, self.timestamp, self.url) return self.to_str()
def __repr__(self): def __repr__(self):
return str((self.type, self.timestamp, self.mod, self.url, str(self))) return str((self.type, self.timestamp, self.mod, self.url, str(self)))