mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge pull request #6 from jcushman/master
Handle ArchivalUrl subclasses.
This commit is contained in:
commit
9a28a2ec6e
@ -59,21 +59,21 @@ class RemoteCDXServer:
|
||||
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
|
||||
return {
|
||||
|
||||
ArchivalUrl.QUERY:
|
||||
wburl.QUERY:
|
||||
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
||||
|
||||
ArchivalUrl.URL_QUERY:
|
||||
wburl.URL_QUERY:
|
||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
||||
},
|
||||
|
||||
ArchivalUrl.REPLAY:
|
||||
wburl.REPLAY:
|
||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
|
||||
# BUG: resolveRevisits currently doesn't work for this type of query
|
||||
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
||||
# but may be an issue in proxy mode
|
||||
ArchivalUrl.LATEST_REPLAY:
|
||||
wburl.LATEST_REPLAY:
|
||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||
|
||||
}[wburl.type]
|
||||
|
@ -27,7 +27,7 @@ class WBHandler:
|
||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
|
||||
query_response = self.query(wbrequest)
|
||||
|
||||
if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY):
|
||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
|
||||
|
||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
|
@ -58,6 +58,7 @@ class ArchivalUrlRewriter:
|
||||
def __init__(self, wburl, prefix):
|
||||
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
|
||||
self.prefix = prefix
|
||||
self.archivalurl_class = self.wburl.__class__
|
||||
|
||||
if self.prefix.endswith('/'):
|
||||
self.prefix = self.prefix[:-1]
|
||||
@ -86,18 +87,18 @@ class ArchivalUrlRewriter:
|
||||
if mod is None:
|
||||
mod = wburl.mod
|
||||
|
||||
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
|
||||
|
||||
return finalUrl
|
||||
|
||||
def getAbsUrl(self, url = ''):
|
||||
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
|
||||
return self.prefix + self.wburl.to_str(url=url)
|
||||
|
||||
def getTimestampUrl(self, timestamp, url = None):
|
||||
if not url:
|
||||
if url is None:
|
||||
url = self.wburl.url
|
||||
|
||||
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url)
|
||||
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
|
||||
|
||||
|
||||
def setBaseUrl(self, newUrl):
|
||||
|
@ -81,7 +81,7 @@ class ArchivalUrl:
|
||||
self.timestamp = ''
|
||||
self.mod = ''
|
||||
|
||||
if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]):
|
||||
if not any (f(url) for f in [self._init_query, self._init_replay]):
|
||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||
|
||||
if len(self.url) == 0:
|
||||
@ -89,10 +89,10 @@ class ArchivalUrl:
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
if self.url.startswith('//'):
|
||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
|
||||
self.url = self.DEFAULT_SCHEME + self.url[2:]
|
||||
# no protocol -> http://
|
||||
elif not '://' in self.url:
|
||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
|
||||
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
|
||||
# %2F is fine, but %2f -- standard supports either
|
||||
@ -104,7 +104,7 @@ class ArchivalUrl:
|
||||
# Match query regex
|
||||
# ======================
|
||||
def _init_query(self, url):
|
||||
query = ArchivalUrl.QUERY_REGEX.match(url)
|
||||
query = self.QUERY_REGEX.match(url)
|
||||
if not query:
|
||||
return None
|
||||
|
||||
@ -114,16 +114,16 @@ class ArchivalUrl:
|
||||
self.timestamp = res[1]
|
||||
self.url = res[2]
|
||||
if self.url.endswith('*'):
|
||||
self.type = ArchivalUrl.URL_QUERY
|
||||
self.type = self.URL_QUERY
|
||||
self.url = self.url[:-1]
|
||||
else:
|
||||
self.type = ArchivalUrl.QUERY
|
||||
self.type = self.QUERY
|
||||
return True
|
||||
|
||||
# Match replay regex
|
||||
# ======================
|
||||
def _init_replay(self, url):
|
||||
replay = ArchivalUrl.REPLAY_REGEX.match(url)
|
||||
replay = self.REPLAY_REGEX.match(url)
|
||||
if not replay:
|
||||
return None
|
||||
|
||||
@ -133,17 +133,21 @@ class ArchivalUrl:
|
||||
self.mod = res[1]
|
||||
self.url = res[2]
|
||||
if self.timestamp:
|
||||
self.type = ArchivalUrl.REPLAY
|
||||
self.type = self.REPLAY
|
||||
else:
|
||||
self.type = ArchivalUrl.LATEST_REPLAY
|
||||
self.type = self.LATEST_REPLAY
|
||||
|
||||
return True
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
@staticmethod
|
||||
def to_str(atype, mod, timestamp, url):
|
||||
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
|
||||
def to_str(self, **overrides):
|
||||
atype = overrides['type'] if 'type' in overrides else self.type
|
||||
mod = overrides['mod'] if 'mod' in overrides else self.mod
|
||||
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
|
||||
url = overrides['url'] if 'url' in overrides else self.url
|
||||
|
||||
if atype == self.QUERY or atype == self.URL_QUERY:
|
||||
tsmod = "/"
|
||||
if mod:
|
||||
tsmod += mod + "/"
|
||||
@ -151,7 +155,7 @@ class ArchivalUrl:
|
||||
tsmod += timestamp
|
||||
|
||||
tsmod += "*/" + url
|
||||
if atype == ArchivalUrl.URL_QUERY:
|
||||
if atype == self.URL_QUERY:
|
||||
tsmod += "*"
|
||||
return tsmod
|
||||
else:
|
||||
@ -162,7 +166,7 @@ class ArchivalUrl:
|
||||
return "/" + url
|
||||
|
||||
def __str__(self):
|
||||
return self.to_str(self.type, self.mod, self.timestamp, self.url)
|
||||
return self.to_str()
|
||||
|
||||
def __repr__(self):
|
||||
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
||||
|
Loading…
x
Reference in New Issue
Block a user