mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add basic wsgi app for parsing archivalurls, fallback on a referrer based redirect
This commit is contained in:
parent
b10f0cd041
commit
27b35f31e8
2
pywb/__init__.py
Normal file
2
pywb/__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#Allow importing
|
||||||
|
|
72
pywb/aurl.py
72
pywb/aurl.py
@ -3,26 +3,31 @@
|
|||||||
import re
|
import re
|
||||||
import rfc3987
|
import rfc3987
|
||||||
|
|
||||||
|
import wbexceptions
|
||||||
|
|
||||||
# aurl : ArchivalUrl representation for WB
|
# aurl : ArchivalUrl representation for WB
|
||||||
|
|
||||||
class aurl:
|
class aurl:
|
||||||
"""
|
"""
|
||||||
# Replay Urls
|
# Replay Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> print_test(aurl('/20131010000506/example.com'))
|
>>> repr(aurl('/20131010000506/example.com'))
|
||||||
('replay', '20131010000506', None, 'example.com')
|
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
|
||||||
|
|
||||||
>>> print_test(aurl('/20130102im_/example.com'))
|
>>> repr(aurl('/20130102im_/example.com'))
|
||||||
('replay', '20130102', 'im_', 'example.com')
|
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
|
||||||
|
|
||||||
>>> print_test(aurl('/https://example.com/xyz'))
|
>>> repr(aurl('/cs_/example.com'))
|
||||||
('latest_replay', None, None, 'https://example.com/xyz')
|
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
|
||||||
|
|
||||||
|
>>> repr(aurl('/https://example.com/xyz'))
|
||||||
|
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
|
||||||
|
|
||||||
|
|
||||||
# Query Urls
|
# Query Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> print_test(aurl('/*/http://example.com/abc?def=a'))
|
>>> repr(aurl('/*/http://example.com/abc?def=a'))
|
||||||
('query', None, None, 'http://example.com/abc?def=a')
|
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
|
|
||||||
# Error Urls
|
# Error Urls
|
||||||
@ -33,34 +38,34 @@ class aurl:
|
|||||||
|
|
||||||
>>> x = aurl('/#$%#/')
|
>>> x = aurl('/#$%#/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
RequestParseException: Bad Request Url: #$%#/
|
BadUrlException: Bad Request Url: #$%#/
|
||||||
|
|
||||||
>>> x = aurl('/http://example.com:abc/')
|
>>> x = aurl('/http://example.com:abc/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
RequestParseException: Bad Request Url: http://example.com:abc/
|
BadUrlException: Bad Request Url: http://example.com:abc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Regexs
|
# Regexs
|
||||||
# ======================
|
# ======================
|
||||||
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
|
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
|
||||||
REPLAY_REGEX = re.compile('^(/(\d{1,14})([a-z]{2}_)?)?/(.*)$')
|
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
|
||||||
# ======================
|
# ======================
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
self.original_url = url
|
self.original_url = url
|
||||||
self.type = None
|
self.type = None
|
||||||
self.url = None
|
self.url = ''
|
||||||
self.timestamp = None
|
self.timestamp = ''
|
||||||
self.mod = None
|
self.mod = ''
|
||||||
|
|
||||||
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
|
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
|
||||||
raise RequestParseException('Invalid WB Request Url: ' + url)
|
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||||
|
|
||||||
matcher = rfc3987.match(self.url, 'IRI_reference')
|
matcher = rfc3987.match(self.url, 'IRI_reference')
|
||||||
|
|
||||||
if not matcher:
|
if not matcher:
|
||||||
raise RequestParseException('Bad Request Url: ' + self.url)
|
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
|
||||||
|
|
||||||
# Match query regex
|
# Match query regex
|
||||||
# ======================
|
# ======================
|
||||||
@ -69,8 +74,10 @@ class aurl:
|
|||||||
if not query:
|
if not query:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.timestamp = query.group(1)
|
res = query.groups('')
|
||||||
self.url = query.group(2)
|
|
||||||
|
self.timestamp = res[0]
|
||||||
|
self.url = res[1]
|
||||||
self.type = 'query'
|
self.type = 'query'
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -81,9 +88,11 @@ class aurl:
|
|||||||
if not replay:
|
if not replay:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.timestamp = replay.group(2)
|
res = replay.groups('')
|
||||||
self.mod = replay.group(3)
|
|
||||||
self.url = replay.group(4)
|
self.timestamp = res[0]
|
||||||
|
self.mod = res[1]
|
||||||
|
self.url = res[2]
|
||||||
if self.timestamp:
|
if self.timestamp:
|
||||||
self.type = 'replay'
|
self.type = 'replay'
|
||||||
else:
|
else:
|
||||||
@ -91,16 +100,25 @@ class aurl:
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# Str Representation
|
||||||
|
# ====================
|
||||||
|
def __str__(self):
|
||||||
|
if self.type == 'query':
|
||||||
|
return "/*/" + self.url
|
||||||
|
else:
|
||||||
|
tsmod = self.timestamp + self.mod
|
||||||
|
if len(tsmod) > 0:
|
||||||
|
return "/" + tsmod + "/" + self.url
|
||||||
|
else:
|
||||||
|
return "/" + self.url
|
||||||
|
|
||||||
class RequestParseException(Exception):
|
def __repr__(self):
|
||||||
pass
|
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
def print_test(self):
|
#def print_test(self):
|
||||||
return self.type, self.timestamp, self.mod, self.url
|
# return self.type, self.timestamp, self.mod, self.url, str(self)
|
||||||
|
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
75
pywb/refer_redirect.py
Normal file
75
pywb/refer_redirect.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import aurl
|
||||||
|
import urlparse
|
||||||
|
from wbrequestresponse import WbRequest, WbResponse
|
||||||
|
|
||||||
|
# Redirect urls that have 'fallen through' based on the referrer
|
||||||
|
# settings
|
||||||
|
class ReferRedirect:
    """
    Fallback handler: redirect a request that matched no collection prefix
    by resolving it against the archival url found in its referrer.

    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']

    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']

    >>> ReferRedirect(None).matchPrefixs
    []

    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """

    def __init__(self, matchPrefixs):
        """
        matchPrefixs -- a referrer prefix, a list of prefixes, or None.

        None means "no prefix configured": it is stored as an empty list so
        that run() simply never matches instead of calling
        str.startswith(None), which raises TypeError.
        """
        if matchPrefixs is None:
            self.matchPrefixs = []
        elif isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        else:
            self.matchPrefixs = [matchPrefixs]

    def run(self, wbrequest):
        """
        Return a redirect WbResponse for wbrequest, or None when the request
        has no referrer, the referrer matches none of the configured
        prefixes, or the referrer cannot be parsed as an archival url.
        """
        referrer = wbrequest.referrer

        if referrer is None:
            return None

        if not any(referrer.startswith(prefix) for prefix in self.matchPrefixs):
            return None

        try:
            ref_split = urlparse.urlsplit(referrer)

            # '/coll/<archival url>' -> ['coll', '<archival url>']
            ref_path = ref_split.path[1:].split('/', 1)

            ref_wb_url = aurl.aurl('/' + ref_path[1])

            # Resolve the requested path relative to the archived page url
            ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])

            # Drop any '../' still present in the joined url (e.g. a request
            # that climbs above the site root, as in the doctests above)
            ref_wb_url.url = ref_wb_url.url.replace('../', '')

            final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))

        except Exception:
            # Deliberately best-effort: this is the last-chance fallback, so
            # any parse failure means "no redirect" and the caller falls
            # through to its not-found handling.
            return None

        return WbResponse.redir_response(final_url)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import doctest

    def test_redir(matchHost, request_uri, referrer):
        # Doctest helper: run a ReferRedirect over a fake WSGI environ and
        # return the redirect target, or False when no redirect is produced.
        environ = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}

        response = ReferRedirect(matchHost).run(WbRequest(environ))

        if response:
            return response.get_header('Location')

        return False

    doctest.testmod()
|
||||||
|
|
||||||
|
|
8
pywb/run.sh
Executable file
8
pywb/run.sh
Executable file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/sh
# Serve a WSGI application with uwsgi over HTTP on port 9090.
# Usage: run.sh [wsgi-file]    (wsgi-file defaults to wbapp.py)

# Fall back to wbapp.py when no argument (or an empty one) is given
app=${1:-wbapp.py}

# Quote the path so names containing spaces or glob characters stay one argument
uwsgi --http :9090 --wsgi-file "$app"
|
59
pywb/wbapp.py
Normal file
59
pywb/wbapp.py
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
from wbrequestresponse import WbRequest, WbResponse
|
||||||
|
from refer_redirect import ReferRedirect
|
||||||
|
import aurl
|
||||||
|
|
||||||
|
class WBHandler:
    """Handler that echoes the parsed archival url back as plain text."""

    def run(self, wbrequest):
        # Parse the collection-relative url and respond with its repr()
        return WbResponse.text_response(repr(aurl.aurl(wbrequest.wb_url)))
|
||||||
|
|
||||||
|
class ArchivalParser:
    """
    Route a WSGI request to a handler based on the prefix of REQUEST_URI,
    falling back on a referrer-based redirect when no prefix matches.
    """

    def __init__(self, mappings, hostpaths=None):
        """
        mappings  -- dict of uri prefix (e.g. '/web/') -> handler object
        hostpaths -- referrer prefix, or list of prefixes, for the fallback
                     redirect; None means no referrer prefix is configured
        """
        self.mappings = mappings
        # Pass an empty list rather than None so the fallback never ends up
        # testing a referrer against a None prefix
        self.fallback = ReferRedirect([] if hostpaths is None else hostpaths)

    def find_handler(self, env):
        """
        Return the handler whose prefix matches env['REQUEST_URI'], setting
        env['WB_URL'] and env['WB_COLL'] as a side effect, or the fallback
        handler (leaving env untouched) when nothing matches.
        """
        request_uri = env['REQUEST_URI']

        # Longest prefix first, so overlapping prefixes resolve the same way
        # on every run instead of depending on dict iteration order
        for key in sorted(self.mappings, key=len, reverse=True):
            if request_uri.startswith(key):
                # Keep the prefix's trailing '/' as the leading '/' of WB_URL
                env['WB_URL'] = request_uri[len(key)-1:]
                # '/web/' -> 'web'
                env['WB_COLL'] = key[1:-1]
                return self.mappings[key]

        return self.fallback

    def handle_request(self, env):
        """Dispatch env to the matching handler and return its response."""
        handler = self.find_handler(env)
        return handler.run(WbRequest(env))

    def handle_exception(self, env, exc):
        """Return a plain-text 400 response describing exc."""
        return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')

    def handle_not_found(self, env):
        """Return a plain-text 404 response naming the requested uri."""
        return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## ===========
|
||||||
|
parser = ArchivalParser({'/web/': WBHandler()}, hostpaths = ['http://localhost:9090/'])
|
||||||
|
## ===========
|
||||||
|
|
||||||
|
|
||||||
|
def application(env, start_response):
    """
    WSGI entry point.

    Dispatches the request through the module-level parser. Any exception
    raised while handling is printed to stderr and turned into the parser's
    error response; a handler that yields no response results in the
    parser's not-found response.
    """
    try:
        response = parser.handle_request(env)

    except Exception as e:
        import traceback
        traceback.print_exc()
        response = parser.handle_exception(env, e)

    if not response:
        response = parser.handle_not_found(env)

    return response(env, start_response)
|
8
pywb/wbexceptions.py
Normal file
8
pywb/wbexceptions.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
class RequestParseException(Exception):
    """Raised when a request url cannot be parsed as an archival url."""
|
||||||
|
|
||||||
|
class BadUrlException(Exception):
    """Raised when the url embedded in a request is not a valid url."""
|
||||||
|
|
||||||
|
|
48
pywb/wbrequestresponse.py
Normal file
48
pywb/wbrequestresponse.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
|
||||||
|
|
||||||
|
# WbRequest
|
||||||
|
# WbRequest
class WbRequest:
    """
    Thin view over a WSGI environ exposing the values the handlers use.

    Every attribute is read with env.get(), so any of them may be None when
    the corresponding key is absent.
    """

    def __init__(self, env):
        self.env = env
        self.wb_url = env.get('WB_URL')        # collection-relative archival url
        self.coll = env.get('WB_COLL')         # collection name
        self.request_uri = env.get('REQUEST_URI')
        self.referrer = env.get('HTTP_REFERER')

    def __repr__(self):
        # %-formatting instead of '+' so a request without WB_COLL / WB_URL
        # (both None) can still be printed rather than raising TypeError
        return '%s %s' % (self.coll, self.wb_url)
|
||||||
|
|
||||||
|
|
||||||
|
class WbResponse:
    """
    Minimal WSGI response: a status line, a list of (name, value) header
    tuples and a body iterable. Instances are callable as WSGI applications.
    """

    def __init__(self, status, value = None, headersList = None):
        """
        status      -- full status line, e.g. '200 OK'
        value       -- body iterable; defaults to a new empty list
        headersList -- list of (name, value) tuples; defaults to a new
                       empty list
        """
        self.status = status
        # Create the defaults per instance: a list used as a default argument
        # is shared by every call, so one response could leak headers or
        # body chunks into another
        self.body = [] if value is None else value
        self.headersList = [] if headersList is None else headersList

    @staticmethod
    def text_response(text, status = '200 OK'):
        """Build a text/plain response whose body is the single chunk text."""
        return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])

    @staticmethod
    def redir_response(location):
        """Build an empty-bodied redirect response pointing at location."""
        return WbResponse('302 Redirect', headersList = [('Location', location)])

    def get_header(self, name):
        """
        Return the value of the first header named name (compared
        case-insensitively), or None when no such header is set.
        """
        name_upp = name.upper()
        for header in self.headersList:
            if header[0].upper() == name_upp:
                return header[1]
        return None

    def __call__(self, env, start_response):
        """WSGI interface: send status and headers, return the body iterable."""
        start_response(self.status, self.headersList)
        return self.body
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user