From 5d42cc0caca9b42d48a2a7d488c18133cf142410 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 13 Dec 2013 15:43:07 -0800 Subject: [PATCH] rename aurl -> archiveurl, add default scheme, test for empty url --- pywb/{aurl.py => archiveurl.py} | 64 ++++++++++++++++++--------------- pywb/refer_redirect.py | 5 +-- pywb/wbapp.py | 4 +-- pywb/wbrequestresponse.py | 4 +-- 4 files changed, 43 insertions(+), 34 deletions(-) rename pywb/{aurl.py => archiveurl.py} (55%) diff --git a/pywb/aurl.py b/pywb/archiveurl.py similarity index 55% rename from pywb/aurl.py rename to pywb/archiveurl.py index f2b863ee..626df774 100644 --- a/pywb/aurl.py +++ b/pywb/archiveurl.py @@ -5,50 +5,56 @@ import rfc3987 import wbexceptions -# aurl : ArchivalUrl representation for WB +# archiveurl : archivalurl representation for WB -class aurl: +class archiveurl: """ # Replay Urls # ====================== - >>> repr(aurl('/20131010000506/example.com')) - "('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')" + >>> repr(archiveurl('/20131010000506/example.com')) + "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')" - >>> repr(aurl('/20130102im_/example.com')) - "('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')" + >>> repr(archiveurl('/20130102im_/https://example.com')) + "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')" - >>> repr(aurl('/cs_/example.com')) - "('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')" + >>> repr(archiveurl('/cs_/example.com')) + "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')" - >>> repr(aurl('/https://example.com/xyz')) + >>> repr(archiveurl('/https://example.com/xyz')) "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')" # Query Urls # ====================== - >>> repr(aurl('/*/http://example.com/abc?def=a')) + >>> repr(archiveurl('/*/http://example.com/abc?def=a')) "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')" # Error Urls # ====================== - >>> x = aurl('abc') + >>> x = archiveurl('abc') Traceback (most recent call last): RequestParseException: Invalid WB Request Url: abc - >>> x = aurl('/#$%#/') + >>> x = archiveurl('/#$%#/') Traceback (most recent call last): - BadUrlException: Bad Request Url: #$%#/ + BadUrlException: Bad Request Url: http://#$%#/ - >>> x = aurl('/http://example.com:abc/') + >>> x = archiveurl('/http://example.com:abc/') Traceback (most recent call last): BadUrlException: Bad Request Url: http://example.com:abc/ """ # Regexs # ====================== - QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$') - REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$') + QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$') + REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$') + + QUERY = 'query' + REPLAY = 'replay' + LATEST_REPLAY = 'latest_replay' + + DEFAULT_SCHEME = 'http://' # ====================== @@ -59,10 +65,16 @@ class aurl: self.timestamp = '' self.mod = '' - if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]): + if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]): raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) - matcher = rfc3987.match(self.url, 'IRI_reference') + if len(self.url) == 0: + raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) + + if not self.url.startswith('//') and not '://' in self.url: + self.url = archiveurl.DEFAULT_SCHEME + self.url + + matcher = rfc3987.match(self.url, 'IRI') if not matcher: raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url) @@ -70,7 +82,7 @@ class aurl: # Match query regex # ====================== def _init_query(self, url): - query = aurl.QUERY_REGEX.match(url) + query = archiveurl.QUERY_REGEX.match(url) if not query: return None @@ -78,13 +90,13 @@ class aurl: self.timestamp = res[0] self.url = res[1] - self.type = 'query' + self.type = archiveurl.QUERY return True # Match replay regex # ====================== def _init_replay(self, url): - replay = aurl.REPLAY_REGEX.match(url) + replay = archiveurl.REPLAY_REGEX.match(url) if not replay: return None @@ -94,16 +106,16 @@ class aurl: self.mod = res[1] self.url = res[2] if self.timestamp: - self.type = 'replay' + self.type = archiveurl.REPLAY else: - self.type = 'latest_replay' + self.type = archiveurl.LATEST_REPLAY return True # Str Representation # ==================== def __str__(self): - if self.type == 'query': + if self.type == archiveurl.QUERY: return "/*/" + self.url else: tsmod = self.timestamp + self.mod @@ -117,8 +129,4 @@ class aurl: if __name__ == "__main__": import doctest - - #def print_test(self): - # return self.type, self.timestamp, self.mod, self.url, str(self) - doctest.testmod() diff --git a/pywb/refer_redirect.py b/pywb/refer_redirect.py index 8595c0a2..59f08054 100644 --- a/pywb/refer_redirect.py +++ b/pywb/refer_redirect.py @@ -1,6 +1,7 @@ -import aurl import urlparse from wbrequestresponse import WbRequest, WbResponse +from archiveurl import archiveurl + # Redirect urls that have 'fallen through' based on the referrer # settings @@ -43,7 +44,7 @@ class ReferRedirect: ref_split = urlparse.urlsplit(wbrequest.referrer) ref_path = ref_split.path[1:].split('/', 1) - ref_wb_url = aurl.aurl('/' + ref_path[1]) + ref_wb_url = archiveurl('/' + ref_path[1]) ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:]) ref_wb_url.url = ref_wb_url.url.replace('../', '') diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 5e7b57d4..f3e464a3 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,10 +1,10 @@ from wbrequestresponse import WbRequest, WbResponse from refer_redirect import ReferRedirect -import aurl +from archiveurl import archiveurl class WBHandler: def run(self, wbrequest): - wburl = aurl.aurl(wbrequest.wb_url) + wburl = archiveurl(wbrequest.wb_url) return WbResponse.text_response(repr(wburl)) class ArchivalParser: diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 27cf14a1..20321c47 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -1,6 +1,6 @@ -#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin -# WbRequest +#WB Request and Response + class WbRequest: def __init__(self, env): self.env = env