mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rename aurl -> archiveurl, add default scheme, test for empty url
This commit is contained in:
parent
6b78f59e49
commit
5d42cc0cac
@ -5,50 +5,56 @@ import rfc3987
|
|||||||
|
|
||||||
import wbexceptions
|
import wbexceptions
|
||||||
|
|
||||||
# aurl : ArchivalUrl representation for WB
|
# archiveurl : archivalurl representation for WB
|
||||||
|
|
||||||
class aurl:
|
class archiveurl:
|
||||||
"""
|
"""
|
||||||
# Replay Urls
|
# Replay Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr(aurl('/20131010000506/example.com'))
|
>>> repr(archiveurl('/20131010000506/example.com'))
|
||||||
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
|
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
|
||||||
|
|
||||||
>>> repr(aurl('/20130102im_/example.com'))
|
>>> repr(archiveurl('/20130102im_/https://example.com'))
|
||||||
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
|
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
|
||||||
|
|
||||||
>>> repr(aurl('/cs_/example.com'))
|
>>> repr(archiveurl('/cs_/example.com'))
|
||||||
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
|
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
|
||||||
|
|
||||||
>>> repr(aurl('/https://example.com/xyz'))
|
>>> repr(archiveurl('/https://example.com/xyz'))
|
||||||
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
|
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
|
||||||
|
|
||||||
|
|
||||||
# Query Urls
|
# Query Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr(aurl('/*/http://example.com/abc?def=a'))
|
>>> repr(archiveurl('/*/http://example.com/abc?def=a'))
|
||||||
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
|
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
|
|
||||||
# Error Urls
|
# Error Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> x = aurl('abc')
|
>>> x = archiveurl('abc')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
RequestParseException: Invalid WB Request Url: abc
|
RequestParseException: Invalid WB Request Url: abc
|
||||||
|
|
||||||
>>> x = aurl('/#$%#/')
|
>>> x = archiveurl('/#$%#/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
BadUrlException: Bad Request Url: #$%#/
|
BadUrlException: Bad Request Url: http://#$%#/
|
||||||
|
|
||||||
>>> x = aurl('/http://example.com:abc/')
|
>>> x = archiveurl('/http://example.com:abc/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
BadUrlException: Bad Request Url: http://example.com:abc/
|
BadUrlException: Bad Request Url: http://example.com:abc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Regexs
|
# Regexs
|
||||||
# ======================
|
# ======================
|
||||||
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
|
QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
|
||||||
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
|
REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
|
||||||
|
|
||||||
|
QUERY = 'query'
|
||||||
|
REPLAY = 'replay'
|
||||||
|
LATEST_REPLAY = 'latest_replay'
|
||||||
|
|
||||||
|
DEFAULT_SCHEME = 'http://'
|
||||||
# ======================
|
# ======================
|
||||||
|
|
||||||
|
|
||||||
@ -59,10 +65,16 @@ class aurl:
|
|||||||
self.timestamp = ''
|
self.timestamp = ''
|
||||||
self.mod = ''
|
self.mod = ''
|
||||||
|
|
||||||
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
|
if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
|
||||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
matcher = rfc3987.match(self.url, 'IRI_reference')
|
if len(self.url) == 0:
|
||||||
|
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
|
if not self.url.startswith('//') and not '://' in self.url:
|
||||||
|
self.url = archiveurl.DEFAULT_SCHEME + self.url
|
||||||
|
|
||||||
|
matcher = rfc3987.match(self.url, 'IRI')
|
||||||
|
|
||||||
if not matcher:
|
if not matcher:
|
||||||
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
|
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
|
||||||
@ -70,7 +82,7 @@ class aurl:
|
|||||||
# Match query regex
|
# Match query regex
|
||||||
# ======================
|
# ======================
|
||||||
def _init_query(self, url):
|
def _init_query(self, url):
|
||||||
query = aurl.QUERY_REGEX.match(url)
|
query = archiveurl.QUERY_REGEX.match(url)
|
||||||
if not query:
|
if not query:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -78,13 +90,13 @@ class aurl:
|
|||||||
|
|
||||||
self.timestamp = res[0]
|
self.timestamp = res[0]
|
||||||
self.url = res[1]
|
self.url = res[1]
|
||||||
self.type = 'query'
|
self.type = archiveurl.QUERY
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Match replay regex
|
# Match replay regex
|
||||||
# ======================
|
# ======================
|
||||||
def _init_replay(self, url):
|
def _init_replay(self, url):
|
||||||
replay = aurl.REPLAY_REGEX.match(url)
|
replay = archiveurl.REPLAY_REGEX.match(url)
|
||||||
if not replay:
|
if not replay:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -94,16 +106,16 @@ class aurl:
|
|||||||
self.mod = res[1]
|
self.mod = res[1]
|
||||||
self.url = res[2]
|
self.url = res[2]
|
||||||
if self.timestamp:
|
if self.timestamp:
|
||||||
self.type = 'replay'
|
self.type = archiveurl.REPLAY
|
||||||
else:
|
else:
|
||||||
self.type = 'latest_replay'
|
self.type = archiveurl.LATEST_REPLAY
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Str Representation
|
# Str Representation
|
||||||
# ====================
|
# ====================
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if self.type == 'query':
|
if self.type == archiveurl.QUERY:
|
||||||
return "/*/" + self.url
|
return "/*/" + self.url
|
||||||
else:
|
else:
|
||||||
tsmod = self.timestamp + self.mod
|
tsmod = self.timestamp + self.mod
|
||||||
@ -117,8 +129,4 @@ class aurl:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
#def print_test(self):
|
|
||||||
# return self.type, self.timestamp, self.mod, self.url, str(self)
|
|
||||||
|
|
||||||
doctest.testmod()
|
doctest.testmod()
|
@ -1,6 +1,7 @@
|
|||||||
import aurl
|
|
||||||
import urlparse
|
import urlparse
|
||||||
from wbrequestresponse import WbRequest, WbResponse
|
from wbrequestresponse import WbRequest, WbResponse
|
||||||
|
from archiveurl import archiveurl
|
||||||
|
|
||||||
|
|
||||||
# Redirect urls that have 'fallen through' based on the referrer
|
# Redirect urls that have 'fallen through' based on the referrer
|
||||||
# settings
|
# settings
|
||||||
@ -43,7 +44,7 @@ class ReferRedirect:
|
|||||||
ref_split = urlparse.urlsplit(wbrequest.referrer)
|
ref_split = urlparse.urlsplit(wbrequest.referrer)
|
||||||
ref_path = ref_split.path[1:].split('/', 1)
|
ref_path = ref_split.path[1:].split('/', 1)
|
||||||
|
|
||||||
ref_wb_url = aurl.aurl('/' + ref_path[1])
|
ref_wb_url = archiveurl('/' + ref_path[1])
|
||||||
|
|
||||||
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
|
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
|
||||||
ref_wb_url.url = ref_wb_url.url.replace('../', '')
|
ref_wb_url.url = ref_wb_url.url.replace('../', '')
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
from wbrequestresponse import WbRequest, WbResponse
|
from wbrequestresponse import WbRequest, WbResponse
|
||||||
from refer_redirect import ReferRedirect
|
from refer_redirect import ReferRedirect
|
||||||
import aurl
|
from archiveurl import archiveurl
|
||||||
|
|
||||||
class WBHandler:
|
class WBHandler:
|
||||||
def run(self, wbrequest):
|
def run(self, wbrequest):
|
||||||
wburl = aurl.aurl(wbrequest.wb_url)
|
wburl = archiveurl(wbrequest.wb_url)
|
||||||
return WbResponse.text_response(repr(wburl))
|
return WbResponse.text_response(repr(wburl))
|
||||||
|
|
||||||
class ArchivalParser:
|
class ArchivalParser:
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
|
|
||||||
|
|
||||||
# WbRequest
|
#WB Request and Response
|
||||||
|
|
||||||
class WbRequest:
|
class WbRequest:
|
||||||
def __init__(self, env):
|
def __init__(self, env):
|
||||||
self.env = env
|
self.env = env
|
||||||
|
Loading…
x
Reference in New Issue
Block a user