1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rename aurl -> archiveurl, add default scheme, test for empty url

This commit is contained in:
Ilya Kreymer 2013-12-13 15:43:07 -08:00
parent 6b78f59e49
commit 5d42cc0cac
4 changed files with 43 additions and 34 deletions

View File

@ -5,50 +5,56 @@ import rfc3987
import wbexceptions
# aurl : ArchivalUrl representation for WB
# archiveurl : archival URL representation for WB
class aurl:
class archiveurl:
"""
# Replay Urls
# ======================
>>> repr(aurl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
>>> repr(archiveurl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
>>> repr(aurl('/20130102im_/example.com'))
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
>>> repr(archiveurl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
>>> repr(aurl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
>>> repr(archiveurl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
>>> repr(aurl('/https://example.com/xyz'))
>>> repr(archiveurl('/https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
# Query Urls
# ======================
>>> repr(aurl('/*/http://example.com/abc?def=a'))
>>> repr(archiveurl('/*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
# Error Urls
# ======================
>>> x = aurl('abc')
>>> x = archiveurl('abc')
Traceback (most recent call last):
RequestParseException: Invalid WB Request Url: abc
>>> x = aurl('/#$%#/')
>>> x = archiveurl('/#$%#/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: #$%#/
BadUrlException: Bad Request Url: http://#$%#/
>>> x = aurl('/http://example.com:abc/')
>>> x = archiveurl('/http://example.com:abc/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://example.com:abc/
"""
# Regexes
# ======================
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
QUERY = 'query'
REPLAY = 'replay'
LATEST_REPLAY = 'latest_replay'
DEFAULT_SCHEME = 'http://'
# ======================
@ -59,10 +65,16 @@ class aurl:
self.timestamp = ''
self.mod = ''
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
matcher = rfc3987.match(self.url, 'IRI_reference')
if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
if not self.url.startswith('//') and not '://' in self.url:
self.url = archiveurl.DEFAULT_SCHEME + self.url
matcher = rfc3987.match(self.url, 'IRI')
if not matcher:
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
@ -70,7 +82,7 @@ class aurl:
# Match query regex
# ======================
def _init_query(self, url):
query = aurl.QUERY_REGEX.match(url)
query = archiveurl.QUERY_REGEX.match(url)
if not query:
return None
@ -78,13 +90,13 @@ class aurl:
self.timestamp = res[0]
self.url = res[1]
self.type = 'query'
self.type = archiveurl.QUERY
return True
# Match replay regex
# ======================
def _init_replay(self, url):
replay = aurl.REPLAY_REGEX.match(url)
replay = archiveurl.REPLAY_REGEX.match(url)
if not replay:
return None
@ -94,16 +106,16 @@ class aurl:
self.mod = res[1]
self.url = res[2]
if self.timestamp:
self.type = 'replay'
self.type = archiveurl.REPLAY
else:
self.type = 'latest_replay'
self.type = archiveurl.LATEST_REPLAY
return True
# Str Representation
# ====================
def __str__(self):
if self.type == 'query':
if self.type == archiveurl.QUERY:
return "/*/" + self.url
else:
tsmod = self.timestamp + self.mod
@ -117,8 +129,4 @@ class aurl:
if __name__ == "__main__":
import doctest
#def print_test(self):
# return self.type, self.timestamp, self.mod, self.url, str(self)
doctest.testmod()

View File

@ -1,6 +1,7 @@
import aurl
import urlparse
from wbrequestresponse import WbRequest, WbResponse
from archiveurl import archiveurl
# Redirect urls that have 'fallen through' based on the referrer
# settings
@ -43,7 +44,7 @@ class ReferRedirect:
ref_split = urlparse.urlsplit(wbrequest.referrer)
ref_path = ref_split.path[1:].split('/', 1)
ref_wb_url = aurl.aurl('/' + ref_path[1])
ref_wb_url = archiveurl('/' + ref_path[1])
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
ref_wb_url.url = ref_wb_url.url.replace('../', '')

View File

@ -1,10 +1,10 @@
from wbrequestresponse import WbRequest, WbResponse
from refer_redirect import ReferRedirect
import aurl
from archiveurl import archiveurl
class WBHandler:
def run(self, wbrequest):
wburl = aurl.aurl(wbrequest.wb_url)
wburl = archiveurl(wbrequest.wb_url)
return WbResponse.text_response(repr(wburl))
class ArchivalParser:

View File

@ -1,6 +1,6 @@
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
# WbRequest
#WB Request and Response
class WbRequest:
def __init__(self, env):
self.env = env