1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add basic wsgi app for parsing archivalurls, fallback on a referrer based redirect

This commit is contained in:
Ilya Kreymer 2013-12-13 15:20:13 -08:00
parent b10f0cd041
commit 27b35f31e8
7 changed files with 245 additions and 27 deletions

2
pywb/__init__.py Normal file
View File

@ -0,0 +1,2 @@
#Allow importing

View File

@ -3,26 +3,31 @@
import re
import rfc3987
import wbexceptions
# aurl : ArchivalUrl representation for WB
class aurl:
"""
# Replay Urls
# ======================
>>> print_test(aurl('/20131010000506/example.com'))
('replay', '20131010000506', None, 'example.com')
>>> repr(aurl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
>>> print_test(aurl('/20130102im_/example.com'))
('replay', '20130102', 'im_', 'example.com')
>>> repr(aurl('/20130102im_/example.com'))
"('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
>>> print_test(aurl('/https://example.com/xyz'))
('latest_replay', None, None, 'https://example.com/xyz')
>>> repr(aurl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
>>> repr(aurl('/https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
# Query Urls
# ======================
>>> print_test(aurl('/*/http://example.com/abc?def=a'))
('query', None, None, 'http://example.com/abc?def=a')
>>> repr(aurl('/*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
# Error Urls
@ -33,34 +38,34 @@ class aurl:
>>> x = aurl('/#$%#/')
Traceback (most recent call last):
RequestParseException: Bad Request Url: #$%#/
BadUrlException: Bad Request Url: #$%#/
>>> x = aurl('/http://example.com:abc/')
Traceback (most recent call last):
RequestParseException: Bad Request Url: http://example.com:abc/
BadUrlException: Bad Request Url: http://example.com:abc/
"""
# Regexs
# ======================
QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
REPLAY_REGEX = re.compile('^(/(\d{1,14})([a-z]{2}_)?)?/(.*)$')
REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
# ======================
def __init__(self, url):
self.original_url = url
self.type = None
self.url = None
self.timestamp = None
self.mod = None
self.url = ''
self.timestamp = ''
self.mod = ''
if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
raise RequestParseException('Invalid WB Request Url: ' + url)
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
matcher = rfc3987.match(self.url, 'IRI_reference')
if not matcher:
raise RequestParseException('Bad Request Url: ' + self.url)
raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
# Match query regex
# ======================
@ -69,8 +74,10 @@ class aurl:
if not query:
return None
self.timestamp = query.group(1)
self.url = query.group(2)
res = query.groups('')
self.timestamp = res[0]
self.url = res[1]
self.type = 'query'
return True
@ -81,9 +88,11 @@ class aurl:
if not replay:
return None
self.timestamp = replay.group(2)
self.mod = replay.group(3)
self.url = replay.group(4)
res = replay.groups('')
self.timestamp = res[0]
self.mod = res[1]
self.url = res[2]
if self.timestamp:
self.type = 'replay'
else:
@ -91,16 +100,25 @@ class aurl:
return True
# Str Representation
# ====================
def __str__(self):
if self.type == 'query':
return "/*/" + self.url
else:
tsmod = self.timestamp + self.mod
if len(tsmod) > 0:
return "/" + tsmod + "/" + self.url
else:
return "/" + self.url
class RequestParseException(Exception):
pass
def __repr__(self):
return str((self.type, self.timestamp, self.mod, self.url, str(self)))
if __name__ == "__main__":
import doctest
def print_test(self):
return self.type, self.timestamp, self.mod, self.url
#def print_test(self):
# return self.type, self.timestamp, self.mod, self.url, str(self)
doctest.testmod()

75
pywb/refer_redirect.py Normal file
View File

@ -0,0 +1,75 @@
import aurl
import urlparse
from wbrequestresponse import WbRequest, WbResponse
# Redirect urls that have 'fallen through' based on the referrer
# settings
class ReferRedirect:
    """
    Fallback handler: redirect a request that matched no collection
    prefix, using the archival url found in its referrer.

    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']

    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']

    >>> ReferRedirect(None).matchPrefixs
    []

    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """

    def __init__(self, matchPrefixs):
        """
        matchPrefixs: a single referrer prefix, a list of prefixes, or
        None for "no prefixes" (in which case run() never redirects).
        """
        if matchPrefixs is None:
            # Previously None was wrapped as [None], which made run()
            # fail with a TypeError in startswith(None).
            self.matchPrefixs = []
        elif isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        else:
            self.matchPrefixs = [matchPrefixs]

    def run(self, wbrequest):
        """
        Return a redirect WbResponse for wbrequest, or None when no
        redirect can be derived.

        None is returned when the request has no referrer, when the
        referrer starts with none of the configured prefixes, or when
        building the redirect target fails for any reason.
        """
        if wbrequest.referrer is None:
            return None

        if not any(wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
            return None

        try:
            ref_split = urlparse.urlsplit(wbrequest.referrer)

            # Referrer path without its leading '/', split into
            # [first path segment, remainder]; the remainder is parsed
            # as an archival url.
            ref_path = ref_split.path[1:].split('/', 1)
            ref_wb_url = aurl.aurl('/' + ref_path[1])

            # Resolve the requested path against the referring url,
            # then drop any '../' segments left in the result.
            ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
            ref_wb_url.url = ref_wb_url.url.replace('../', '')

            final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))

        except Exception:
            # Deliberate best effort: a referrer that cannot be turned
            # into a redirect target simply means "no redirect".
            return None

        return WbResponse.redir_response(final_url)
if __name__ == "__main__":
    import doctest

    def test_redir(matchHost, request_uri, referrer):
        """
        Doctest helper: run a ReferRedirect configured with matchHost
        against a request built from request_uri and referrer.

        Returns the 'Location' header of the response, or False when
        no response was produced.
        """
        redir = ReferRedirect(matchHost)
        req = WbRequest({'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer})

        rep = redir.run(req)
        return rep.get_header('Location') if rep else False

    doctest.testmod()

8
pywb/run.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/sh

# Serve a WSGI app over HTTP on port 9090 with uwsgi.
# Usage: run.sh [wsgi-file]    (defaults to wbapp.py when omitted or empty)
app=${1:-wbapp.py}

# Quote the path so a file name containing spaces or glob characters
# reaches uwsgi as a single argument.
uwsgi --http :9090 --wsgi-file "$app"

59
pywb/wbapp.py Normal file
View File

@ -0,0 +1,59 @@
from wbrequestresponse import WbRequest, WbResponse
from refer_redirect import ReferRedirect
import aurl
class WBHandler:
    """Handler that answers with the repr of the parsed archival url."""

    def run(self, wbrequest):
        """
        Parse wbrequest.wb_url as an archival url and return its repr
        as a plain-text response. Exceptions raised by the parser are
        not caught here.
        """
        return WbResponse.text_response(repr(aurl.aurl(wbrequest.wb_url)))
class ArchivalParser:
    """
    Route a request to a handler by the prefix of its REQUEST_URI,
    falling back to a referrer-based redirect when no prefix matches.
    """

    def __init__(self, mappings, hostpaths=None):
        """
        mappings:  mapping of uri prefix -> handler object with a
                   run(wbrequest) method
        hostpaths: passed unchanged to ReferRedirect for the fallback
        """
        self.mappings = mappings
        self.fallback = ReferRedirect(hostpaths)

    def find_handler(self, env):
        """
        Return the handler of the first mapping whose prefix starts
        env['REQUEST_URI'], or the fallback when none does.

        On a match, env gains 'WB_URL' and 'WB_COLL'.
        """
        uri = env['REQUEST_URI']

        for prefix, handler in self.mappings.iteritems():
            if not uri.startswith(prefix):
                continue

            # WB_URL starts at the last character of the prefix,
            # WB_COLL is the prefix minus its first and last character.
            env['WB_URL'] = uri[len(prefix) - 1:]
            env['WB_COLL'] = prefix[1:-1]
            return handler

        return self.fallback

    def handle_request(self, env):
        """Find the handler for env and run it on a new WbRequest."""
        return self.find_handler(env).run(WbRequest(env))

    def handle_exception(self, env, exc):
        """Build a plain-text '400 Bad Request' response describing exc."""
        message = 'Error: ' + str(exc)
        return WbResponse.text_response(message, status='400 Bad Request')

    def handle_not_found(self, env):
        """Build a plain-text '404 Not Found' response naming the request uri."""
        message = 'Not Found: ' + env['REQUEST_URI']
        return WbResponse.text_response(message, status='404 Not Found')
## ===========
# Module-level router used by application(): requests under '/web/' go
# to a WBHandler, everything else to the referrer-based fallback.
parser = ArchivalParser(
    {'/web/': WBHandler()},
    hostpaths=['http://localhost:9090/'],
)
## ===========
def application(env, start_response):
    """
    WSGI entry point.

    Delegates to the module-level parser. Any exception raised while
    handling the request is printed with its traceback and turned into
    the parser's error response. A falsy handler result (e.g. None from
    the fallback) is turned into the parser's not-found response.
    """
    try:
        response = parser.handle_request(env)
    except Exception as e:
        import traceback
        traceback.print_exc()
        response = parser.handle_exception(env, e)

    if not response:
        response = parser.handle_not_found(env)

    # The response object is itself callable with the WSGI signature.
    return response(env, start_response)

8
pywb/wbexceptions.py Normal file
View File

@ -0,0 +1,8 @@
class RequestParseException(Exception):
    """Raised when a request url cannot be parsed as an archival url."""
class BadUrlException(Exception):
    """Raised when the url inside a request fails validation."""

48
pywb/wbrequestresponse.py Normal file
View File

@ -0,0 +1,48 @@
#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin
# WbRequest
class WbRequest:
    """
    Wrapper around a WSGI environ exposing the values handlers read.

    Attributes (each is None when the key is absent from env):
      env          the environ dict itself
      wb_url       env['WB_URL']
      coll         env['WB_COLL']
      request_uri  env['REQUEST_URI']
      referrer     env['HTTP_REFERER']
    """

    def __init__(self, env):
        self.env = env

        self.wb_url = env.get('WB_URL')
        self.coll = env.get('WB_COLL')

        self.request_uri = env.get('REQUEST_URI')
        self.referrer = env.get('HTTP_REFERER')

    def __repr__(self):
        # coll and wb_url may be None (env.get); %-formatting renders
        # them as 'None' where string concatenation raised TypeError.
        return '%s %s' % (self.coll, self.wb_url)
class WbResponse:
    """
    Minimal WSGI response: a status line, a list of (name, value)
    header tuples and an iterable body. Instances are callable with the
    WSGI (env, start_response) signature.
    """

    def __init__(self, status, value=None, headersList=None):
        """
        status:      status line, e.g. '200 OK'
        value:       body iterable; defaults to a new empty list
        headersList: list of (name, value) tuples; defaults to a new
                     empty list
        """
        self.status = status
        # Fresh lists per instance: a mutable default argument would be
        # shared by every response created without that argument.
        self.body = [] if value is None else value
        self.headersList = [] if headersList is None else headersList

    @staticmethod
    def text_response(text, status='200 OK'):
        """Return a text/plain response whose body is the single chunk text."""
        return WbResponse(status, value=[text], headersList=[('Content-Type', 'text/plain')])

    @staticmethod
    def redir_response(location):
        """Return an empty-bodied redirect response pointing at location."""
        return WbResponse('302 Redirect', headersList=[('Location', location)])

    def get_header(self, name):
        """
        Return the value of the first header whose name equals name,
        compared case-insensitively, or None when there is none.
        """
        name_upp = name.upper()
        for value in self.headersList:
            if value[0].upper() == name_upp:
                return value[1]
        return None

    def __call__(self, env, start_response):
        """Start the WSGI response and return the body iterable."""
        start_response(self.status, self.headersList)
        return self.body