mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add aurl.py with a few tests
This commit is contained in:
parent
0dc56ee074
commit
10bf465367
104
pywb/aurl.py
Normal file
104
pywb/aurl.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
import re
|
||||||
|
import rfc3987
|
||||||
|
|
||||||
|
# aurl : ArchivalUrl representation for WB
|
||||||
|
|
||||||
|
class aurl:
|
||||||
|
"""
|
||||||
|
# Replay Urls
|
||||||
|
# ======================
|
||||||
|
>>> print_test(aurl('/20131010000506/example.com'))
|
||||||
|
('replay', '20131010000506', None, 'example.com')
|
||||||
|
|
||||||
|
>>> print_test(aurl('/20130102im_/example.com'))
|
||||||
|
('replay', '20130102', 'im_', 'example.com')
|
||||||
|
|
||||||
|
>>> print_test(aurl('/https://example.com/xyz'))
|
||||||
|
('latest_replay', None, None, 'https://example.com/xyz')
|
||||||
|
|
||||||
|
|
||||||
|
# Query Urls
|
||||||
|
# ======================
|
||||||
|
>>> print_test(aurl('/*/http://example.com/abc?def=a'))
|
||||||
|
('query', None, None, 'http://example.com/abc?def=a')
|
||||||
|
|
||||||
|
|
||||||
|
# Error Urls
|
||||||
|
# ======================
|
||||||
|
>>> x = aurl('abc')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
RequestParseException: Invalid WB Request Url: abc
|
||||||
|
|
||||||
|
>>> x = aurl('/#$%#/')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
RequestParseException: Bad Request Url: #$%#/
|
||||||
|
|
||||||
|
>>> x = aurl('/http://example.com:abc/')
|
||||||
|
Traceback (most recent call last):
|
||||||
|
RequestParseException: Bad Request Url: http://example.com:abc/
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Regexs
|
||||||
|
# ======================
|
||||||
|
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
|
||||||
|
REPLAY_REGEX = re.compile('^(/(\d{1,14})([a-z]{2}_)?)?/(.*)$')
|
||||||
|
# ======================
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, url):
|
||||||
|
self.original_url = url
|
||||||
|
self.type = None
|
||||||
|
self.url = None
|
||||||
|
self.timestamp = None
|
||||||
|
self.mod = None
|
||||||
|
|
||||||
|
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
|
||||||
|
raise RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
|
matcher = rfc3987.match(self.url, 'URI_reference')
|
||||||
|
|
||||||
|
if not matcher:
|
||||||
|
raise RequestParseException('Bad Request Url: ' + self.url)
|
||||||
|
|
||||||
|
# Match query regex
|
||||||
|
# ======================
|
||||||
|
def _init_query(self, url):
|
||||||
|
query = aurl.QUERY_REGEX.match(url)
|
||||||
|
if not query:
|
||||||
|
return None
|
||||||
|
|
||||||
|
self.timestamp = query.group(1)
|
||||||
|
self.url = query.group(2)
|
||||||
|
self.type = 'query'
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Match replay regex
|
||||||
|
# ======================
|
||||||
|
def _init_replay(self, url):
|
||||||
|
replay = aurl.REPLAY_REGEX.match(url)
|
||||||
|
if not replay:
|
||||||
|
return None
|
||||||
|
|
||||||
|
self.timestamp = replay.group(2)
|
||||||
|
self.mod = replay.group(3)
|
||||||
|
self.url = replay.group(4)
|
||||||
|
if self.timestamp:
|
||||||
|
self.type = 'replay'
|
||||||
|
else:
|
||||||
|
self.type = 'latest_replay'
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class RequestParseException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
|
||||||
|
def print_test(self):
|
||||||
|
return self.type, self.timestamp, self.mod, self.url
|
||||||
|
|
||||||
|
doctest.testmod()
|
Loading…
x
Reference in New Issue
Block a user