From 903583c3d7d4f36b77da74a696bf9b8cf0929fce Mon Sep 17 00:00:00 2001 From: Jack Cushman Date: Mon, 20 Jan 2014 14:12:59 -0500 Subject: [PATCH] Handle ArchivalUrl subclasses. --- pywb/indexreader.py | 8 ++++---- pywb/replay.py | 2 +- pywb/url_rewriter.py | 9 +++++---- pywb/wbarchivalurl.py | 32 ++++++++++++++++++-------------- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index daf024b0..1acac741 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -59,21 +59,21 @@ class RemoteCDXServer: def getQueryParams(wburl, limit = '150000', collapseTime = '10', replayClosest = '4000'): return { - ArchivalUrl.QUERY: + wburl.QUERY: {'collapseTime': collapseTime, 'filter': '!statuscode:(500|502|504)', 'limit': limit}, - ArchivalUrl.URL_QUERY: + wburl.URL_QUERY: {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', }, - ArchivalUrl.REPLAY: + wburl.REPLAY: {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replayClosest, 'closest': wburl.timestamp, 'resolveRevisits': True}, # BUG: resolveRevisits currently doesn't work for this type of query # This is not an issue in archival mode, as there is a redirect to the actual timestamp query # but may be an issue in proxy mode - ArchivalUrl.LATEST_REPLAY: + wburl.LATEST_REPLAY: {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} }[wburl.type] diff --git a/pywb/replay.py b/pywb/replay.py index adc87d52..daf6b373 100644 --- a/pywb/replay.py +++ b/pywb/replay.py @@ -27,7 +27,7 @@ class WBHandler: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: query_response = self.query(wbrequest) - if (wbrequest.wb_url.type == ArchivalUrl.QUERY) or (wbrequest.wb_url.type == ArchivalUrl.URL_QUERY): + if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): return self.htmlquery(wbrequest, query_response) if self.htmlquery else query_response with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: diff --git a/pywb/url_rewriter.py b/pywb/url_rewriter.py index e5eae359..7958b260 100644 --- a/pywb/url_rewriter.py +++ b/pywb/url_rewriter.py @@ -58,6 +58,7 @@ class ArchivalUrlRewriter: def __init__(self, wburl, prefix): self.wburl = wburl if isinstance(wburl, ArchivalUrl) else ArchivalUrl(wburl) self.prefix = prefix + self.archivalurl_class = self.wburl.__class__ if self.prefix.endswith('/'): self.prefix = self.prefix[:-1] @@ -86,18 +87,18 @@ class ArchivalUrlRewriter: if mod is None: mod = wburl.mod - finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl) + finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl) return finalUrl def getAbsUrl(self, url = ''): - return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url) + return self.prefix + self.wburl.to_str(url=url) def getTimestampUrl(self, timestamp, url = None): - if not url: + if url is None: url = self.wburl.url - return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, timestamp, url) + return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url) def setBaseUrl(self, newUrl): diff --git a/pywb/wbarchivalurl.py b/pywb/wbarchivalurl.py index b3e024d3..1cba4182 100644 --- a/pywb/wbarchivalurl.py +++ b/pywb/wbarchivalurl.py @@ -81,7 +81,7 @@ class ArchivalUrl: self.timestamp = '' self.mod = '' - if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]): + if not any (f(url) for f in [self._init_query, self._init_replay]): raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) if len(self.url) == 0: @@ -89,10 +89,10 @@ class ArchivalUrl: # protocol agnostic url -> http:// if self.url.startswith('//'): - self.url = ArchivalUrl.DEFAULT_SCHEME + self.url[2:] + self.url = self.DEFAULT_SCHEME + self.url[2:] # no protocol -> http:// elif not '://' in self.url: - self.url = ArchivalUrl.DEFAULT_SCHEME + self.url + self.url = self.DEFAULT_SCHEME + self.url # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding # %2F is fine, but %2f -- standard supports either @@ -104,7 +104,7 @@ class ArchivalUrl: # Match query regex # ====================== def _init_query(self, url): - query = ArchivalUrl.QUERY_REGEX.match(url) + query = self.QUERY_REGEX.match(url) if not query: return None @@ -114,16 +114,16 @@ class ArchivalUrl: self.timestamp = res[1] self.url = res[2] if self.url.endswith('*'): - self.type = ArchivalUrl.URL_QUERY + self.type = self.URL_QUERY self.url = self.url[:-1] else: - self.type = ArchivalUrl.QUERY + self.type = self.QUERY return True # Match replay regex # ====================== def _init_replay(self, url): - replay = ArchivalUrl.REPLAY_REGEX.match(url) + replay = self.REPLAY_REGEX.match(url) if not replay: return None @@ -133,17 +133,21 @@ class ArchivalUrl: self.mod = res[1] self.url = res[2] if self.timestamp: - self.type = ArchivalUrl.REPLAY + self.type = self.REPLAY else: - self.type = ArchivalUrl.LATEST_REPLAY + self.type = self.LATEST_REPLAY return True # Str Representation # ==================== - @staticmethod - def to_str(atype, mod, timestamp, url): - if atype == ArchivalUrl.QUERY or atype == ArchivalUrl.URL_QUERY: + def to_str(self, **overrides): + atype = overrides['type'] if 'type' in overrides else self.type + mod = overrides['mod'] if 'mod' in overrides else self.mod + timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp + url = overrides['url'] if 'url' in overrides else self.url + + if atype == self.QUERY or atype == self.URL_QUERY: tsmod = "/" if mod: tsmod += mod + "/" @@ -151,7 +155,7 @@ class ArchivalUrl: tsmod += timestamp tsmod += "*/" + url - if atype == ArchivalUrl.URL_QUERY: + if atype == self.URL_QUERY: tsmod += "*" return tsmod else: @@ -162,7 +166,7 @@ class ArchivalUrl: return "/" + url def __str__(self): - return self.to_str(self.type, self.mod, self.timestamp, self.url) + return self.to_str() def __repr__(self): return str((self.type, self.timestamp, self.mod, self.url, str(self)))