1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge pull request #6 from jcushman/master

Handle ArchivalUrl subclasses.
This commit is contained in:
ikreymer 2014-01-20 13:08:35 -08:00
commit 9a28a2ec6e
4 changed files with 28 additions and 23 deletions

View File

@ -59,21 +59,21 @@ class RemoteCDXServer:
def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'):
return {
ArchivalUrl.QUERY:
wburl.QUERY:
{'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
ArchivalUrl.URL_QUERY:
wburl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
},
ArchivalUrl.REPLAY:
wburl.REPLAY:
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True},
# BUG: resolveRevisits currently doesn't work for this type of query
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
# but may be an issue in proxy mode
ArchivalUrl.LATEST_REPLAY:
wburl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
}[wburl.type]

View File

@ -27,7 +27,7 @@ class WBHandler:
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
query_response = self.query(wbrequest)
if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY):
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:

View File

@ -58,6 +58,7 @@ class ArchivalUrlRewriter:
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl)
self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
@ -86,18 +87,18 @@ class ArchivalUrlRewriter:
if mod is None:
mod = wburl.mod
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
return finalUrl
def getAbsUrl(self, url = ''):
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
return self.prefix + self.wburl.to_str(url=url)
def getTimestampUrl(self, timestamp, url = None):
if not url:
if url is None:
url = self.wburl.url
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url)
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def setBaseUrl(self, newUrl):

View File

@ -81,7 +81,7 @@ class ArchivalUrl:
self.timestamp = ''
self.mod = ''
if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]):
if not any (f(url) for f in [self._init_query, self._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if len(self.url) == 0:
@ -89,10 +89,10 @@ class ArchivalUrl:
# protocol agnostic url -> http://
if self.url.startswith('//'):
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:]
self.url = self.DEFAULT_SCHEME + self.url[2:]
# no protocol -> http://
elif not '://' in self.url:
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
self.url = self.DEFAULT_SCHEME + self.url
# BUG?: adding upper() because the rfc3987 lib rejects lower-case %-encoding:
# %2F is accepted but %2f is not, even though the standard supports either
@ -104,7 +104,7 @@ class ArchivalUrl:
# Match query regex
# ======================
def _init_query(self, url):
query = ArchivalUrl.QUERY_REGEX.match(url)
query = self.QUERY_REGEX.match(url)
if not query:
return None
@ -114,16 +114,16 @@ class ArchivalUrl:
self.timestamp = res[1]
self.url = res[2]
if self.url.endswith('*'):
self.type = ArchivalUrl.URL_QUERY
self.type = self.URL_QUERY
self.url = self.url[:-1]
else:
self.type = ArchivalUrl.QUERY
self.type = self.QUERY
return True
# Match replay regex
# ======================
def _init_replay(self, url):
replay = ArchivalUrl.REPLAY_REGEX.match(url)
replay = self.REPLAY_REGEX.match(url)
if not replay:
return None
@ -133,17 +133,21 @@ class ArchivalUrl:
self.mod = res[1]
self.url = res[2]
if self.timestamp:
self.type = ArchivalUrl.REPLAY
self.type = self.REPLAY
else:
self.type = ArchivalUrl.LATEST_REPLAY
self.type = self.LATEST_REPLAY
return True
# Str Representation
# ====================
@staticmethod
def to_str(atype, mod, timestamp, url):
if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY:
def to_str(self, **overrides):
atype = overrides['type'] if 'type' in overrides else self.type
mod = overrides['mod'] if 'mod' in overrides else self.mod
timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp
url = overrides['url'] if 'url' in overrides else self.url
if atype == self.QUERY or atype == self.URL_QUERY:
tsmod = "/"
if mod:
tsmod += mod + "/"
@ -151,7 +155,7 @@ class ArchivalUrl:
tsmod += timestamp
tsmod += "*/" + url
if atype == ArchivalUrl.URL_QUERY:
if atype == self.URL_QUERY:
tsmod += "*"
return tsmod
else:
@ -162,7 +166,7 @@ class ArchivalUrl:
return "/" + url
def __str__(self):
return self.to_str(self.type, self.mod, self.timestamp, self.url)
return self.to_str()
def __repr__(self):
return str((self.type, self.timestamp, self.mod, self.url, str(self)))