1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Rename aurl -> archiveurl, add a default scheme (http://) for URLs given without one, and reject empty URLs

This commit is contained in:
Ilya Kreymer 2013-12-13 15:43:07 -08:00
parent 6b78f59e49
commit 5d42cc0cac
4 changed files with 43 additions and 34 deletions

View File

@ -5,50 +5,56 @@ import rfc3987
import wbexceptions import wbexceptions
# aurl : ArchivalUrl representation for WB # archiveurl : archivalurl representation for WB
class aurl: class archiveurl:
""" """
# Replay Urls # Replay Urls
# ====================== # ======================
>>> repr(aurl('/20131010000506/example.com')) >>> repr(archiveurl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')" "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
>>> repr(aurl('/20130102im_/example.com')) >>> repr(archiveurl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')" "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
>>> repr(aurl('/cs_/example.com')) >>> repr(archiveurl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')" "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
>>> repr(aurl('/https://example.com/xyz')) >>> repr(archiveurl('/https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')" "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
# Query Urls # Query Urls
# ====================== # ======================
>>> repr(aurl('/*/http://example.com/abc?def=a')) >>> repr(archiveurl('/*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')" "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
# Error Urls # Error Urls
# ====================== # ======================
>>> x = aurl('abc') >>> x = archiveurl('abc')
Traceback (most recent call last): Traceback (most recent call last):
RequestParseException: Invalid WB Request Url: abc RequestParseException: Invalid WB Request Url: abc
>>> x = aurl('/#$%#/') >>> x = archiveurl('/#$%#/')
Traceback (most recent call last): Traceback (most recent call last):
BadUrlException: Bad Request Url: #$%#/ BadUrlException: Bad Request Url: http://#$%#/
>>> x = aurl('/http://example.com:abc/') >>> x = archiveurl('/http://example.com:abc/')
Traceback (most recent call last): Traceback (most recent call last):
BadUrlException: Bad Request Url: http://example.com:abc/ BadUrlException: Bad Request Url: http://example.com:abc/
""" """
# Regexs # Regexs
# ====================== # ======================
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$') QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$') REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
QUERY = 'query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://'
# ====================== # ======================
@ -59,10 +65,16 @@ class aurl:
self.timestamp = '' self.timestamp = ''
self.mod = '' self.mod = ''
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]): if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
matcher = rfc3987.match(self.url, 'IRI_reference') if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if not self.url.startswith('//') and not '://' in self.url:
self.url = archiveurl.DEFAULT_SCHEME + self.url
matcher = rfc3987.match(self.url, 'IRI')
if not matcher: if not matcher:
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url) raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
@ -70,7 +82,7 @@ class aurl:
# Match query regex # Match query regex
# ====================== # ======================
def _init_query(self, url): def _init_query(self, url):
query = aurl.QUERY_REGEX.match(url) query = archiveurl.QUERY_REGEX.match(url)
if not query: if not query:
return None return None
@ -78,13 +90,13 @@ class aurl:
self.timestamp = res[0] self.timestamp = res[0]
self.url = res[1] self.url = res[1]
self.type = 'query' self.type = archiveurl.QUERY
return True return True
# Match replay regex # Match replay regex
# ====================== # ======================
def _init_replay(self, url): def _init_replay(self, url):
replay = aurl.REPLAY_REGEX.match(url) replay = archiveurl.REPLAY_REGEX.match(url)
if not replay: if not replay:
return None return None
@ -94,16 +106,16 @@ class aurl:
self.mod = res[1] self.mod = res[1]
self.url = res[2] self.url = res[2]
if self.timestamp: if self.timestamp:
self.type = 'replay' self.type = archiveurl.REPLAY
else: else:
self.type = 'latest_replay' self.type = archiveurl.LATEST_REPLAY
return True return True
# Str Representation # Str Representation
# ==================== # ====================
def __str__(self): def __str__(self):
if self.type == 'query': if self.type == archiveurl.QUERY:
return "/*/" + self.url return "/*/" + self.url
else: else:
tsmod = self.timestamp + self.mod tsmod = self.timestamp + self.mod
@ -117,8 +129,4 @@ class aurl:
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
#def print_test(self):
# return self.type, self.timestamp, self.mod, self.url, str(self)
doctest.testmod() doctest.testmod()

View File

@ -1,6 +1,7 @@
import aurl
import urlparse import urlparse
from wbrequestresponse import WbRequest, WbResponse from wbrequestresponse import WbRequest, WbResponse
from archiveurl import archiveurl
# Redirect urls that have 'fallen through' based on the referrer # Redirect urls that have 'fallen through' based on the referrer
# settings # settings
@ -43,7 +44,7 @@ class ReferRedirect:
ref_split = urlparse.urlsplit(wbrequest.referrer) ref_split = urlparse.urlsplit(wbrequest.referrer)
ref_path = ref_split.path[1:].split('/', 1) ref_path = ref_split.path[1:].split('/', 1)
ref_wb_url = aurl.aurl('/' + ref_path[1]) ref_wb_url = archiveurl('/' + ref_path[1])
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:]) ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
ref_wb_url.url = ref_wb_url.url.replace('../', '') ref_wb_url.url = ref_wb_url.url.replace('../', '')

View File

@ -1,10 +1,10 @@
from wbrequestresponse import WbRequest, WbResponse from wbrequestresponse import WbRequest, WbResponse
from refer_redirect import ReferRedirect from refer_redirect import ReferRedirect
import aurl from archiveurl import archiveurl
class WBHandler: class WBHandler:
def run(self, wbrequest): def run(self, wbrequest):
wburl = aurl.aurl(wbrequest.wb_url) wburl = archiveurl(wbrequest.wb_url)
return WbResponse.text_response(repr(wburl)) return WbResponse.text_response(repr(wburl))
class ArchivalParser: class ArchivalParser:

View File

@ -1,6 +1,6 @@
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
# WbRequest #WB Request and Response
class WbRequest: class WbRequest:
def __init__(self, env): def __init__(self, env):
self.env = env self.env = env