mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rename aurl -> archiveurl, add default scheme, test for empty url
This commit is contained in:
parent
6b78f59e49
commit
5d42cc0cac
@ -5,50 +5,56 @@ import rfc3987
|
||||
|
||||
import wbexceptions
|
||||
|
||||
# aurl : ArchivalUrl representation for WB
|
||||
# archiveurl : archival URL representation for WB
|
||||
|
||||
class aurl:
|
||||
class archiveurl:
|
||||
"""
|
||||
# Replay Urls
|
||||
# ======================
|
||||
>>> repr(aurl('/20131010000506/example.com'))
|
||||
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
|
||||
>>> repr(archiveurl('/20131010000506/example.com'))
|
||||
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
|
||||
|
||||
>>> repr(aurl('/20130102im_/example.com'))
|
||||
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
|
||||
>>> repr(archiveurl('/20130102im_/https://example.com'))
|
||||
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
|
||||
|
||||
>>> repr(aurl('/cs_/example.com'))
|
||||
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
|
||||
>>> repr(archiveurl('/cs_/example.com'))
|
||||
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
|
||||
|
||||
>>> repr(aurl('/https://example.com/xyz'))
|
||||
>>> repr(archiveurl('/https://example.com/xyz'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
|
||||
|
||||
|
||||
# Query Urls
|
||||
# ======================
|
||||
>>> repr(aurl('/*/http://example.com/abc?def=a'))
|
||||
>>> repr(archiveurl('/*/http://example.com/abc?def=a'))
|
||||
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
|
||||
|
||||
|
||||
# Error Urls
|
||||
# ======================
|
||||
>>> x = aurl('abc')
|
||||
>>> x = archiveurl('abc')
|
||||
Traceback (most recent call last):
|
||||
RequestParseException: Invalid WB Request Url: abc
|
||||
|
||||
>>> x = aurl('/#$%#/')
|
||||
>>> x = archiveurl('/#$%#/')
|
||||
Traceback (most recent call last):
|
||||
BadUrlException: Bad Request Url: #$%#/
|
||||
BadUrlException: Bad Request Url: http://#$%#/
|
||||
|
||||
>>> x = aurl('/http://example.com:abc/')
|
||||
>>> x = archiveurl('/http://example.com:abc/')
|
||||
Traceback (most recent call last):
|
||||
BadUrlException: Bad Request Url: http://example.com:abc/
|
||||
"""
|
||||
|
||||
# Regexes
|
||||
# ======================
|
||||
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
|
||||
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
|
||||
QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
|
||||
REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
|
||||
|
||||
QUERY = 'query'
|
||||
REPLAY = 'replay'
|
||||
LATEST_REPLAY = 'latest_replay'
|
||||
|
||||
DEFAULT_SCHEME = 'http://'
|
||||
# ======================
|
||||
|
||||
|
||||
@ -59,10 +65,16 @@ class aurl:
|
||||
self.timestamp = ''
|
||||
self.mod = ''
|
||||
|
||||
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
|
||||
if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
|
||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||
|
||||
matcher = rfc3987.match(self.url, 'IRI_reference')
|
||||
if len(self.url) == 0:
|
||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||
|
||||
if not self.url.startswith('//') and not '://' in self.url:
|
||||
self.url = archiveurl.DEFAULT_SCHEME + self.url
|
||||
|
||||
matcher = rfc3987.match(self.url, 'IRI')
|
||||
|
||||
if not matcher:
|
||||
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
|
||||
@ -70,7 +82,7 @@ class aurl:
|
||||
# Match query regex
|
||||
# ======================
|
||||
def _init_query(self, url):
|
||||
query = aurl.QUERY_REGEX.match(url)
|
||||
query = archiveurl.QUERY_REGEX.match(url)
|
||||
if not query:
|
||||
return None
|
||||
|
||||
@ -78,13 +90,13 @@ class aurl:
|
||||
|
||||
self.timestamp = res[0]
|
||||
self.url = res[1]
|
||||
self.type = 'query'
|
||||
self.type = archiveurl.QUERY
|
||||
return True
|
||||
|
||||
# Match replay regex
|
||||
# ======================
|
||||
def _init_replay(self, url):
|
||||
replay = aurl.REPLAY_REGEX.match(url)
|
||||
replay = archiveurl.REPLAY_REGEX.match(url)
|
||||
if not replay:
|
||||
return None
|
||||
|
||||
@ -94,16 +106,16 @@ class aurl:
|
||||
self.mod = res[1]
|
||||
self.url = res[2]
|
||||
if self.timestamp:
|
||||
self.type = 'replay'
|
||||
self.type = archiveurl.REPLAY
|
||||
else:
|
||||
self.type = 'latest_replay'
|
||||
self.type = archiveurl.LATEST_REPLAY
|
||||
|
||||
return True
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
def __str__(self):
|
||||
if self.type == 'query':
|
||||
if self.type == archiveurl.QUERY:
|
||||
return "/*/" + self.url
|
||||
else:
|
||||
tsmod = self.timestamp + self.mod
|
||||
@ -117,8 +129,4 @@ class aurl:
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
#def print_test(self):
|
||||
# return self.type, self.timestamp, self.mod, self.url, str(self)
|
||||
|
||||
doctest.testmod()
|
@ -1,6 +1,7 @@
|
||||
import aurl
|
||||
import urlparse
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from archiveurl import archiveurl
|
||||
|
||||
|
||||
# Redirect urls that have 'fallen through' based on the referrer
|
||||
# settings
|
||||
@ -43,7 +44,7 @@ class ReferRedirect:
|
||||
ref_split = urlparse.urlsplit(wbrequest.referrer)
|
||||
ref_path = ref_split.path[1:].split('/', 1)
|
||||
|
||||
ref_wb_url = aurl.aurl('/' + ref_path[1])
|
||||
ref_wb_url = archiveurl('/' + ref_path[1])
|
||||
|
||||
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
|
||||
ref_wb_url.url = ref_wb_url.url.replace('../', '')
|
||||
|
@ -1,10 +1,10 @@
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from refer_redirect import ReferRedirect
|
||||
import aurl
|
||||
from archiveurl import archiveurl
|
||||
|
||||
class WBHandler:
|
||||
def run(self, wbrequest):
|
||||
wburl = aurl.aurl(wbrequest.wb_url)
|
||||
wburl = archiveurl(wbrequest.wb_url)
|
||||
return WbResponse.text_response(repr(wburl))
|
||||
|
||||
class ArchivalParser:
|
||||
|
@ -1,6 +1,6 @@
|
||||
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
|
||||
|
||||
# WbRequest
|
||||
#WB Request and Response
|
||||
|
||||
class WbRequest:
|
||||
def __init__(self, env):
|
||||
self.env = env
|
||||
|
Loading…
x
Reference in New Issue
Block a user