#!/usr/bin/python """ WbUrl represents the standard wayback archival url format. A regular url is a subset of the WbUrl (latest replay). The WbUrl expresses the common interface for interacting with the wayback machine. There WbUrl may represent one of the following forms: query form: [/modifier]/[timestamp][-end_timestamp]*/ modifier, timestamp and end_timestamp are optional */example.com 20101112030201*/http://example.com 2009-2015*/http://example.com /cdx/*/http://example.com url query form: used to indicate query across urls same as query form but with a final * */example.com* 20101112030201*/http://example.com* replay form: 20101112030201/http://example.com 20101112030201im_/http://example.com latest_replay: (no timestamp) http://example.com Additionally, the BaseWbUrl provides the base components (url, timestamp, end_timestamp, modifier, type) which can be used to provide a custom representation of the wayback url format. """ import re import rfc3987 #================================================================= class BaseWbUrl(object): QUERY = 'query' URL_QUERY = 'url_query' REPLAY = 'replay' LATEST_REPLAY = 'latest_replay' def __init__(self, url='', mod='', timestamp='', end_timestamp='', type=None): self.url = url self.timestamp = timestamp self.end_timestamp = end_timestamp self.mod = mod self.type = type #================================================================= class WbUrl(BaseWbUrl): """ # Replay Urls # ====================== >>> repr(WbUrl('20131010000506/example.com')) "('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')" >>> repr(WbUrl('20130102im_/https://example.com')) "('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" # Protocol agnostic convert to http >>> repr(WbUrl('20130102im_///example.com')) "('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')" >>> repr(WbUrl('cs_/example.com')) "('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')" >>> repr(WbUrl('https://example.com/xyz')) "('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" # Query Urls # ====================== >>> repr(WbUrl('*/http://example.com/abc?def=a')) "('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')" >>> repr(WbUrl('*/http://example.com/abc?def=a*')) "('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')" >>> repr(WbUrl('2010*/http://example.com/abc?def=a')) "('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')" # timestamp range query >>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a')) "('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')" >>> repr(WbUrl('json/*/http://example.com/abc?def=a')) "('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')" >>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')" # strip off repeated, likely scheme-agnostic, slashes altogether >>> repr(WbUrl('///example.com')) "('latest_replay', '', '', 'http://example.com', 'http://example.com')" >>> repr(WbUrl('//example.com/')) "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" >>> repr(WbUrl('/example.com/')) "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" # Error Urls # ====================== >>> x = WbUrl('/#$%#/') Traceback (most recent call last): Exception: Bad Request Url: http://#$%#/ >>> x = WbUrl('/http://example.com:abc/') Traceback (most recent call last): Exception: Bad Request Url: http://example.com:abc/ """ # Regexs # ====================== QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') DEFAULT_SCHEME = 'http://' # ====================== def __init__(self, url): super(WbUrl, self).__init__() self.original_url = url if not any (f(url) for f in [self._init_query, self._init_replay]): raise Exception('Invalid WbUrl: ', url) if len(self.url) == 0: raise Exception('Invalid WbUrl: ', url) # protocol agnostic url -> http:// #if self.url.startswith('//'): # self.url = self.DEFAULT_SCHEME + self.url[2:] # no protocol -> http:// if not '://' in self.url: self.url = self.DEFAULT_SCHEME + self.url # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding # %2F is fine, but %2f -- standard supports either matcher = rfc3987.match(self.url.upper(), 'IRI') if not matcher: raise Exception('Bad Request Url: ' + self.url) # Match query regex # ====================== def _init_query(self, url): query = self.QUERY_REGEX.match(url) if not query: return None res = query.groups('') self.mod = res[0] self.timestamp = res[1] self.end_timestamp = res[2] self.url = res[3] if self.url.endswith('*'): self.type = self.URL_QUERY self.url = self.url[:-1] else: self.type = self.QUERY return True # Match replay regex # ====================== def _init_replay(self, url): replay = self.REPLAY_REGEX.match(url) if not replay: return None res = replay.groups('') self.timestamp = res[0] self.mod = res[1] self.url = res[2] if self.timestamp: self.type = self.REPLAY else: self.type = self.LATEST_REPLAY return True # Str Representation # ==================== def to_str(self, **overrides): atype = overrides['type'] if 'type' in overrides else self.type mod = overrides['mod'] if 'mod' in overrides else self.mod timestamp = overrides['timestamp'] if 'timestamp' in overrides else self.timestamp end_timestamp = overrides['end_timestamp'] if 'end_timestamp' in overrides else self.end_timestamp url = overrides['url'] if 'url' in overrides else self.url if atype == self.QUERY or atype == self.URL_QUERY: tsmod = '' if mod: tsmod += mod + "/" if timestamp: tsmod += timestamp if end_timestamp: tsmod += '-' + end_timestamp tsmod += "*/" + url if atype == self.URL_QUERY: tsmod += "*" return tsmod else: tsmod = timestamp + mod if len(tsmod) > 0: return tsmod + "/" + url else: return url def __str__(self): return self.to_str() def __repr__(self): return str((self.type, self.timestamp, self.mod, self.url, str(self))) if __name__ == "__main__": import doctest doctest.testmod()