mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add wburlrewriter, ReferRedirect uses the rewriter
more refactoring: ReferRedirect moved into archivalrouter module; wbrequest parses from uri directly and keeps track of wburl and prefix
This commit is contained in:
parent
0a2b16407d
commit
4cf4bf3bbb
@ -1,22 +1,122 @@
|
||||
from refer_redirect import ReferRedirect
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
import urlparse
|
||||
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from wburlrewriter import ArchivalUrlRewriter
|
||||
|
||||
#=================================================================
|
||||
# ArchivalRequestRouter -- route WB requests in archival mode
|
||||
#=================================================================
|
||||
class ArchivalRequestRouter:
|
||||
def __init__(self, mappings, hostpaths=None):
|
||||
def __init__(self, mappings, hostpaths = None, abs_path = True):
|
||||
self.mappings = mappings
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
self.abs_path = abs_path
|
||||
|
||||
def parse_request(self, env):
|
||||
def _parseRequest(self, env):
|
||||
request_uri = env['REQUEST_URI']
|
||||
|
||||
for key, value in self.mappings.iteritems():
|
||||
if request_uri.startswith(key):
|
||||
return value, WbRequest.prefix_request(env, key, request_uri)
|
||||
for coll, handler in self.mappings.iteritems():
|
||||
rel_prefix = '/' + coll + '/'
|
||||
if request_uri.startswith(rel_prefix):
|
||||
#return value, ArchivalRequestRouter._prefix_request(env, key, request_uri)
|
||||
req = WbRequest(env,
|
||||
request_uri = request_uri,
|
||||
coll = coll,
|
||||
wb_url = request_uri[len(coll) + 1:],
|
||||
wb_prefix = self.getPrefix(env, rel_prefix))
|
||||
|
||||
return handler, req
|
||||
|
||||
return self.fallback, WbRequest(env)
|
||||
|
||||
def handle_request(self, env):
|
||||
handler, wbrequest = self.parse_request(env)
|
||||
def handleRequest(self, env):
    """
    Route a request and run the handler chosen for it.

    Delegates to self._parseRequest(env), which yields a
    (handler, wbrequest) pair, and returns whatever
    handler.run(wbrequest) returns.
    """
    target, parsed_request = self._parseRequest(env)
    response = target.run(parsed_request)
    return response
|
||||
|
||||
def getPrefix(self, env, rel_prefix):
    """
    Build the prefix for a collection.

    Returns rel_prefix unchanged when self.abs_path is falsy, or when
    env lacks 'wsgi.url_scheme' or 'HTTP_HOST'. Otherwise returns
    '<scheme>://<host>' followed by rel_prefix.
    """
    # Relative mode: nothing to prepend
    if not self.abs_path:
        return rel_prefix

    try:
        scheme = env['wsgi.url_scheme']
        host = env['HTTP_HOST']
    except KeyError:
        # Not enough information for an absolute prefix
        return rel_prefix

    return scheme + '://' + host + rel_prefix
|
||||
|
||||
|
||||
#=================================================================
|
||||
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
|
||||
#=================================================================
|
||||
class ReferRedirect:

    """
    Fallback handler: redirect urls that have 'fallen through' the
    collection routing, using the referrer to work out which archived
    page the request was relative to.

    matchPrefixs may be a single prefix string, a list of prefix
    strings, or None (no prefixes, so nothing is ever redirected).

    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']

    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']

    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """

    def __init__(self, matchPrefixs):
        if matchPrefixs is None:
            # None used to be stored as [None], which made
            # referrer.startswith(None) raise TypeError in run()
            self.matchPrefixs = []
        elif isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        else:
            self.matchPrefixs = [matchPrefixs]

    def run(self, wbrequest):
        """
        Return a redirect response for wbrequest, or None.

        None is returned when the request has no referrer, when the
        referrer starts with none of self.matchPrefixs, or when the
        referrer path does not contain both a collection and an
        archival url. Otherwise the request uri is rewritten relative
        to the referrer's archival url and the result of
        WbResponse.redir_response(final_url) is returned.

        Errors raised while rewriting are not caught here and
        propagate to the caller.
        """
        referrer = wbrequest.referrer
        if referrer is None:
            return None

        if not any(referrer.startswith(prefix) for prefix in self.matchPrefixs):
            return None

        ref_split = urlparse.urlsplit(referrer)

        # '/coll/<wburl>' -> ['coll', '<wburl>']
        ref_path = ref_split.path[1:].split('/', 1)

        # Referrer such as 'http://host/' or 'http://host/coll' carries
        # no archival url to resolve against: cannot redirect
        if len(ref_path) < 2:
            return None

        rewriter = ArchivalUrlRewriter('/' + ref_path[1], '/' + ref_path[0])

        # Drop the leading '/' so the uri resolves relative to the referring page
        rel_request_uri = wbrequest.request_uri[1:]

        final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))

        return WbResponse.redir_response(final_url)
|
||||
|
||||
if __name__ == "__main__":
    import doctest

    def test_redir(matchHost, request_uri, referrer):
        """
        Doctest helper: run ReferRedirect(matchHost) against a request
        built from request_uri and referrer. Returns the 'Location'
        header of the response, or False when no response is produced.
        """
        env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}

        response = ReferRedirect(matchHost).run(WbRequest.parse(env))

        return response.get_header('Location') if response else False

    doctest.testmod()
|
||||
|
||||
|
||||
|
@ -2,6 +2,8 @@ import urllib
|
||||
import urllib2
|
||||
import wbexceptions
|
||||
|
||||
from wbarchivalurl import ArchivalUrl
|
||||
|
||||
class RemoteCDXServer:
|
||||
"""
|
||||
>>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
|
||||
@ -45,6 +47,27 @@ class RemoteCDXServer:
|
||||
else:
|
||||
return response
|
||||
|
||||
@staticmethod
def getQueryParams(wburl):
    """
    Return the CDX query parameters for wburl, selected by wburl.type.

    wburl.timestamp is used as the 'closest' value for replay lookups.
    Raises KeyError when wburl.type is not one of ArchivalUrl.QUERY,
    URL_QUERY, REPLAY or LATEST_REPLAY.
    """
    error_filter = '!statuscode:(500|502|504)'

    query_params = {'collapseTime': '10', 'filter': error_filter, 'limit': '150000'}

    url_query_params = {
        'collapse': 'urlkey',
        'matchType': 'prefix',
        'showGroupCount': True,
        'showUniqCount': True,
        'lastSkipTimestamp': True,
        'limit': '100',
        'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
    }

    replay_params = {
        'sort': 'closest',
        'filter': error_filter,
        'limit': '10',
        'closest': wburl.timestamp,
        'resolveRevisits': True,
    }

    latest_replay_params = {
        'sort': 'reverse',
        'filter': 'statuscode:[23]..',
        'limit': '1',
        'resolveRevisits': True,
    }

    params_by_type = {
        ArchivalUrl.QUERY: query_params,
        ArchivalUrl.URL_QUERY: url_query_params,
        ArchivalUrl.REPLAY: replay_params,
        ArchivalUrl.LATEST_REPLAY: latest_replay_params,
    }

    return params_by_type[wburl.type]
|
||||
|
||||
|
||||
class CDXCaptureResult:
|
||||
CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
|
||||
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
|
||||
|
@ -1,76 +0,0 @@
|
||||
import urlparse
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from archiveurl import archiveurl
|
||||
|
||||
|
||||
# Redirect urls that have 'fallen through' based on the referrer
|
||||
# settings
|
||||
class ReferRedirect:
|
||||
|
||||
"""
|
||||
>>> ReferRedirect('http://localhost:8080/').matchPrefixs
|
||||
['http://localhost:8080/']
|
||||
|
||||
>>> ReferRedirect(['http://example:9090/']).matchPrefixs
|
||||
['http://example:9090/']
|
||||
|
||||
>>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
|
||||
|
||||
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/other.html'
|
||||
|
||||
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/other.html'
|
||||
|
||||
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
|
||||
False
|
||||
"""
|
||||
|
||||
def __init__(self, matchPrefixs):
|
||||
if isinstance(matchPrefixs, list):
|
||||
self.matchPrefixs = matchPrefixs
|
||||
else:
|
||||
self.matchPrefixs = [matchPrefixs]
|
||||
|
||||
def run(self, wbrequest):
|
||||
if wbrequest.referrer is None:
|
||||
return None
|
||||
|
||||
if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
|
||||
return None
|
||||
|
||||
try:
|
||||
ref_split = urlparse.urlsplit(wbrequest.referrer)
|
||||
ref_path = ref_split.path[1:].split('/', 1)
|
||||
|
||||
ref_wb_url = archiveurl('/' + ref_path[1])
|
||||
|
||||
ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
|
||||
ref_wb_url.url = ref_wb_url.url.replace('../', '')
|
||||
|
||||
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))
|
||||
|
||||
except Exception as e:
|
||||
return None
|
||||
|
||||
return WbResponse.redir_response(final_url)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
def test_redir(matchHost, request_uri, referrer):
|
||||
env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}
|
||||
|
||||
redir = ReferRedirect(matchHost)
|
||||
req = WbRequest(env)
|
||||
rep = redir.run(req)
|
||||
if not rep:
|
||||
return False
|
||||
|
||||
return rep.get_header('Location')
|
||||
|
||||
|
||||
doctest.testmod()
|
||||
|
||||
|
@ -1,47 +1,28 @@
|
||||
from wbrequestresponse import WbResponse
|
||||
from archiveurl import archiveurl
|
||||
from archivalrouter import ArchivalRequestRouter
|
||||
import indexreader
|
||||
import json
|
||||
import wbexceptions
|
||||
import utils
|
||||
|
||||
from wbrequestresponse import WbResponse
|
||||
from archivalrouter import ArchivalRequestRouter
|
||||
|
||||
class EchoEnv:
    """Handler that echoes the request's env back as a text response."""

    def run(self, wbrequest):
        env_text = str(wbrequest.env)
        return WbResponse.text_response(env_text)
|
||||
|
||||
class WBHandler:
|
||||
def run(self, wbrequest):
|
||||
wburl = archiveurl(wbrequest.wb_url)
|
||||
wbrequest.parsed_url = wburl
|
||||
return WbResponse.text_stream(str(vars(wburl)))
|
||||
return WbResponse.text_response(str(wbrequest))
|
||||
|
||||
class QueryHandler:
|
||||
def __init__(self):
|
||||
self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
|
||||
|
||||
@staticmethod
|
||||
def get_query_params(wburl):
|
||||
return {
|
||||
|
||||
archiveurl.QUERY:
|
||||
{'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
|
||||
|
||||
archiveurl.URL_QUERY:
|
||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
|
||||
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
|
||||
},
|
||||
|
||||
archiveurl.REPLAY:
|
||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
|
||||
archiveurl.LATEST_REPLAY:
|
||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||
|
||||
}[wburl.type]
|
||||
|
||||
|
||||
def run(self, wbrequest):
|
||||
wburl = archiveurl(wbrequest.wb_url)
|
||||
#wburl = wbresponse.body.parsed_url
|
||||
wburl = wbrequest.wb_url
|
||||
|
||||
params = QueryHandler.get_query_params(wburl)
|
||||
params = self.cdxserver.getQueryParams(wburl)
|
||||
|
||||
cdxlines = self.cdxserver.load(wburl.url, params)
|
||||
|
||||
@ -56,8 +37,10 @@ class QueryHandler:
|
||||
|
||||
## ===========
|
||||
parser = ArchivalRequestRouter(
|
||||
{'/t1/' : WBHandler(),
|
||||
'/t2/' : QueryHandler()
|
||||
{
|
||||
't0' : EchoEnv(),
|
||||
't1' : WBHandler(),
|
||||
't2' : QueryHandler()
|
||||
},
|
||||
hostpaths = ['http://localhost:9090/'])
|
||||
## ===========
|
||||
@ -67,7 +50,7 @@ def application(env, start_response):
|
||||
response = None
|
||||
|
||||
try:
|
||||
response = parser.handle_request(env)
|
||||
response = parser.handleRequest(env)
|
||||
|
||||
if not response:
|
||||
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
|
||||
@ -76,11 +59,11 @@ def application(env, start_response):
|
||||
last_exc = e
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
response = handle_exception(env, e)
|
||||
response = handleException(env, e)
|
||||
|
||||
return response(env, start_response)
|
||||
|
||||
def handle_exception(env, exc):
|
||||
def handleException(env, exc):
|
||||
if hasattr(exc, 'status'):
|
||||
status = exc.status()
|
||||
else:
|
||||
|
@ -5,51 +5,51 @@ import rfc3987
|
||||
|
||||
import wbexceptions
|
||||
|
||||
# archiveurl : archivalurl representation for WB
|
||||
# ArchivalUrl : archivalurl representation for WB
|
||||
|
||||
class archiveurl:
|
||||
class ArchivalUrl:
|
||||
"""
|
||||
# Replay Urls
|
||||
# ======================
|
||||
>>> repr(archiveurl('/20131010000506/example.com'))
|
||||
>>> repr(ArchivalUrl('/20131010000506/example.com'))
|
||||
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
|
||||
|
||||
>>> repr(archiveurl('/20130102im_/https://example.com'))
|
||||
>>> repr(ArchivalUrl('/20130102im_/https://example.com'))
|
||||
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
|
||||
|
||||
>>> repr(archiveurl('/cs_/example.com'))
|
||||
>>> repr(ArchivalUrl('/cs_/example.com'))
|
||||
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
|
||||
|
||||
>>> repr(archiveurl('/https://example.com/xyz'))
|
||||
>>> repr(ArchivalUrl('/https://example.com/xyz'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
|
||||
|
||||
|
||||
# Query Urls
|
||||
# ======================
|
||||
>>> repr(archiveurl('/*/http://example.com/abc?def=a'))
|
||||
>>> repr(ArchivalUrl('/*/http://example.com/abc?def=a'))
|
||||
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
|
||||
|
||||
>>> repr(archiveurl('/*/http://example.com/abc?def=a*'))
|
||||
>>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*'))
|
||||
"('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
|
||||
|
||||
>>> repr(archiveurl('/json/*/http://example.com/abc?def=a'))
|
||||
>>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a'))
|
||||
"('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
|
||||
|
||||
>>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a'))
|
||||
>>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
|
||||
"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
|
||||
|
||||
|
||||
# Error Urls
|
||||
# ======================
|
||||
>>> x = archiveurl('abc')
|
||||
>>> x = ArchivalUrl('abc')
|
||||
Traceback (most recent call last):
|
||||
RequestParseException: Invalid WB Request Url: abc
|
||||
|
||||
>>> x = archiveurl('/#$%#/')
|
||||
>>> x = ArchivalUrl('/#$%#/')
|
||||
Traceback (most recent call last):
|
||||
BadUrlException: Bad Request Url: http://#$%#/
|
||||
|
||||
>>> x = archiveurl('/http://example.com:abc/')
|
||||
>>> x = ArchivalUrl('/http://example.com:abc/')
|
||||
Traceback (most recent call last):
|
||||
BadUrlException: Bad Request Url: http://example.com:abc/
|
||||
"""
|
||||
@ -75,14 +75,14 @@ class archiveurl:
|
||||
self.timestamp = ''
|
||||
self.mod = ''
|
||||
|
||||
if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
|
||||
if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]):
|
||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||
|
||||
if len(self.url) == 0:
|
||||
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
|
||||
|
||||
if not self.url.startswith('//') and not '://' in self.url:
|
||||
self.url = archiveurl.DEFAULT_SCHEME + self.url
|
||||
self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
|
||||
|
||||
matcher = rfc3987.match(self.url, 'IRI')
|
||||
|
||||
@ -92,7 +92,7 @@ class archiveurl:
|
||||
# Match query regex
|
||||
# ======================
|
||||
def _init_query(self, url):
|
||||
query = archiveurl.QUERY_REGEX.match(url)
|
||||
query = ArchivalUrl.QUERY_REGEX.match(url)
|
||||
if not query:
|
||||
return None
|
||||
|
||||
@ -102,16 +102,16 @@ class archiveurl:
|
||||
self.timestamp = res[1]
|
||||
self.url = res[2]
|
||||
if self.url.endswith('*'):
|
||||
self.type = archiveurl.URL_QUERY
|
||||
self.type = ArchivalUrl.URL_QUERY
|
||||
self.url = self.url[:-1]
|
||||
else:
|
||||
self.type = archiveurl.QUERY
|
||||
self.type = ArchivalUrl.QUERY
|
||||
return True
|
||||
|
||||
# Match replay regex
|
||||
# ======================
|
||||
def _init_replay(self, url):
|
||||
replay = archiveurl.REPLAY_REGEX.match(url)
|
||||
replay = ArchivalUrl.REPLAY_REGEX.match(url)
|
||||
if not replay:
|
||||
return None
|
||||
|
||||
@ -121,16 +121,16 @@ class archiveurl:
|
||||
self.mod = res[1]
|
||||
self.url = res[2]
|
||||
if self.timestamp:
|
||||
self.type = archiveurl.REPLAY
|
||||
self.type = ArchivalUrl.REPLAY
|
||||
else:
|
||||
self.type = archiveurl.LATEST_REPLAY
|
||||
self.type = ArchivalUrl.LATEST_REPLAY
|
||||
|
||||
return True
|
||||
|
||||
# Str Representation
|
||||
# ====================
|
||||
def __str__(self):
|
||||
if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY:
|
||||
if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
|
||||
tsmod = "/"
|
||||
if self.mod:
|
||||
tsmod += self.mod + "/"
|
||||
@ -138,7 +138,7 @@ class archiveurl:
|
||||
tsmod += self.timestamp
|
||||
|
||||
tsmod += "*/" + self.url
|
||||
if self.type == archiveurl.URL_QUERY:
|
||||
if self.type == ArchivalUrl.URL_QUERY:
|
||||
tsmod += "*"
|
||||
return tsmod
|
||||
else:
|
@ -1,32 +1,79 @@
|
||||
from wbarchivalurl import ArchivalUrl
|
||||
#WB Request and Response
|
||||
|
||||
class WbRequest:
|
||||
"""
|
||||
>>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/')
|
||||
WbRequest(env, '/_embed/example.com/?a=b', 'save')
|
||||
>>> WbRequest.parse({'REQUEST_URI': '/save/_embed/example.com/?a=b'})
|
||||
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', '/http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
|
||||
|
||||
>>> WbRequest.parse({'REQUEST_URI': '/2345/20101024101112im_/example.com/?b=c'})
|
||||
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '/20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
|
||||
|
||||
>>> WbRequest.parse({'REQUEST_URI': '/2010/example.com'})
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
>>> WbRequest.parse({'REQUEST_URI': '../example.com'})
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
|
||||
"""
|
||||
|
||||
def __init__(self, env, request_uri = '', wb_url = '', coll = ''):
|
||||
self.env = env
|
||||
|
||||
# if len(wb_url) == 0:
|
||||
# wb_url = request_uri
|
||||
|
||||
setattr(self, 'wb_url', wb_url)
|
||||
setattr(self, 'coll', coll)
|
||||
|
||||
setattr(self, 'request_uri', request_uri)
|
||||
setattr(self, 'referrer', env.get('HTTP_REFERER'))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def prefix_request(env, prefix, request_uri = ''):
|
||||
def parse(env, request_uri = ''):
|
||||
if not request_uri:
|
||||
request_uri = env.get('REQUEST_URI')
|
||||
return WbRequest(env, request_uri, request_uri[len(prefix)-1:], coll = prefix[1:-1])
|
||||
|
||||
parts = request_uri.split('/', 2)
|
||||
|
||||
# Has coll prefix
|
||||
if len(parts) == 3:
|
||||
wb_prefix = '/' + parts[1] + '/'
|
||||
wb_url = '/' + parts[2]
|
||||
coll = parts[1]
|
||||
# No Coll Prefix
|
||||
elif len(parts) == 2:
|
||||
wb_prefix = '/'
|
||||
wb_url = '/' + parts[1]
|
||||
coll = ''
|
||||
else:
|
||||
wb_prefix = '/'
|
||||
wb_url = parts[0]
|
||||
coll = ''
|
||||
|
||||
return WbRequest(env, request_uri, wb_prefix, wb_url, coll)
|
||||
|
||||
def __init__(self, env, request_uri, wb_prefix, wb_url, coll):
    """
    Store the parsed pieces of a request.

    request_uri falls back to env's 'REQUEST_URI' when empty; wb_url
    is parsed into an ArchivalUrl; referrer is read from env's
    'HTTP_REFERER'; is_ajax is computed last since it reads env and
    referrer.
    """
    self.env = env

    # Explicit uri wins, otherwise take it from the environ
    self.request_uri = request_uri or env.get('REQUEST_URI')

    self.wb_prefix = wb_prefix
    self.wb_url = ArchivalUrl(wb_url)
    self.coll = coll

    self.referrer = env.get('HTTP_REFERER')
    self.is_ajax = self._is_ajax()
|
||||
|
||||
|
||||
def _is_ajax(self):
    """
    Return True if the request looks like an ajax request.

    False when env has no 'HTTP_X_REQUESTED_WITH' value. True when
    that value is 'xmlhttprequest' (case-insensitive), or when the
    request has a referrer and 'ajaxpipe' appears in the query string.
    """
    value = self.env.get('HTTP_X_REQUESTED_WITH')
    if not value:
        return False

    if value.lower() == 'xmlhttprequest':
        return True

    # QUERY_STRING may be missing (or None) in the environ; fall back
    # to '' so the membership test cannot raise TypeError
    query_string = self.env.get('QUERY_STRING') or ''

    if self.referrer and ('ajaxpipe' in query_string):
        return True

    return False
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
|
||||
#return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
|
||||
#return str(vars(self))
|
||||
varlist = vars(self)
|
||||
return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
|
||||
|
||||
|
||||
class WbResponse:
|
||||
|
52
pywb/wburlrewriter.py
Normal file
52
pywb/wburlrewriter.py
Normal file
@ -0,0 +1,52 @@
|
||||
import copy
|
||||
import urlparse
|
||||
|
||||
from wbarchivalurl import ArchivalUrl
|
||||
|
||||
class ArchivalUrlRewriter:
    """
    Rewrite urls relative to an archival url, under a given prefix.

    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

    >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
    '/2020/http://example.com/other.html'
    """

    def __init__(self, wburl_str, prefix):
        self.wburl_str = wburl_str
        # Stored without a trailing '/', since wburl_str supplies the separator
        self.prefix = prefix[:-1] if prefix.endswith('/') else prefix

    def rewrite(self, rel_url):
        """
        Return rel_url resolved against the archival url, with the
        prefix prepended.
        """
        # No parent-directory references: join against prefix + wburl directly
        if '../' not in rel_url:
            return urlparse.urljoin(self.prefix + self.wburl_str, rel_url)

        # Otherwise resolve against the original url only, then strip
        # any '../' left over after the join
        wburl = ArchivalUrl(self.wburl_str)
        resolved = urlparse.urljoin(wburl.url, rel_url)
        wburl.url = resolved.replace('../', '')

        return self.prefix + str(wburl)
|
||||
|
||||
if __name__ == "__main__":
    import doctest

    def test_rewrite(rel_url, base_url, prefix):
        """Doctest helper: rewrite rel_url against base_url under prefix."""
        return ArchivalUrlRewriter(base_url, prefix).rewrite(rel_url)

    doctest.testmod()
|
Loading…
x
Reference in New Issue
Block a user