mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
urlrewrite: remove dependency on bottle from rewriterapp,
add overridable error and query views, with extensible get_query_params() and process_query_cdx() to extend cdx for query view; add get_top_url() for adding custom top_url for frame insert; add call_with_params() for adding custom params to environ
This commit is contained in:
parent
b056acd88e
commit
3b6cab1730
@ -1,30 +1,41 @@
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from bottle import request, response, HTTPError
|
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
|
from pywb.utils.wbexception import WbException
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.timeutils import http_date_to_timestamp
|
from pywb.utils.timeutils import http_date_to_timestamp
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||||
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
|
|
||||||
from urlrewrite.rewriteinputreq import RewriteInputRequest
|
from urlrewrite.rewriteinputreq import RewriteInputRequest
|
||||||
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView
|
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import gevent
|
import gevent
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class UpstreamException(WbException):
    """Error that carries the status code of an upstream response.

    ``url`` and ``details`` are forwarded to ``WbException`` (``details``
    as ``msg``); the status code is kept on the instance.
    """

    def __init__(self, status_code, url, details):
        """Create the exception.

        :param status_code: status code reported for the upstream request
        :param url: url that was being requested
        :param details: additional error information, passed on as ``msg``
        """
        super(UpstreamException, self).__init__(msg=details, url=url)
        self.status_code = status_code
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class RewriterApp(object):
|
class RewriterApp(object):
|
||||||
def __init__(self, framed_replay=False, jinja_env=None):
|
def __init__(self, framed_replay=False, jinja_env=None, config=None):
|
||||||
self.loader = ArcWarcRecordLoader()
|
self.loader = ArcWarcRecordLoader()
|
||||||
|
|
||||||
|
config = config or {}
|
||||||
|
|
||||||
self.framed_replay = framed_replay
|
self.framed_replay = framed_replay
|
||||||
self.frame_mod = ''
|
self.frame_mod = ''
|
||||||
self.replay_mod = 'mp_'
|
self.replay_mod = 'mp_'
|
||||||
@ -37,33 +48,55 @@ class RewriterApp(object):
|
|||||||
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
|
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
|
||||||
|
|
||||||
self.jinja_env = jinja_env
|
self.jinja_env = jinja_env
|
||||||
|
|
||||||
self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
|
self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
|
||||||
self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
|
self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
|
||||||
|
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
|
||||||
|
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
|
||||||
|
|
||||||
def render_content(self, wb_url, **kwargs):
|
def call_with_params(self, **kwargs):
    """Return a WSGI callable that invokes this app with fixed params.

    The keyword arguments are stored in the request environ under
    ``'pywb.kwargs'`` before the request is delegated to this app.

    :param kwargs: custom params to attach to every request environ
    :return: callable taking ``(environ, start_response)``
    """
    app = self

    def app_with_params(environ, start_response):
        # Stash the bound params so the app can read them from environ.
        environ['pywb.kwargs'] = kwargs
        return app(environ, start_response)

    return app_with_params
|
||||||
|
|
||||||
|
def __call__(self, environ, start_response):
    """WSGI entry point.

    Builds the wb_url string from the request, renders the content and
    lets the resulting response object answer the WSGI call. An
    ``UpstreamException`` raised while rendering is converted into an
    error response via ``handle_error``.

    :param environ: WSGI environ of the request
    :param start_response: WSGI ``start_response`` callable
    :return: whatever the response object returns for the WSGI call
    """
    requested = self.get_wburl(environ)
    # Custom params, if any, are expected under 'pywb.kwargs'.
    params = environ.get('pywb.kwargs', {})

    try:
        wb_response = self.render_content(requested, params, environ)
    except UpstreamException as upstream_err:
        wb_response = self.handle_error(environ, upstream_err)

    return wb_response(environ, start_response)
|
||||||
|
|
||||||
|
def render_content(self, wb_url, kwargs, environ):
|
||||||
wb_url = WbUrl(wb_url)
|
wb_url = WbUrl(wb_url)
|
||||||
#if wb_url.mod == 'vi_':
|
#if wb_url.mod == 'vi_':
|
||||||
# return self._get_video_info(wbrequest)
|
# return self._get_video_info(wbrequest)
|
||||||
|
|
||||||
host_prefix = self.get_host_prefix()
|
host_prefix = self.get_host_prefix(environ)
|
||||||
rel_prefix = self.get_rel_prefix()
|
rel_prefix = self.get_rel_prefix(environ)
|
||||||
full_prefix = host_prefix + rel_prefix
|
full_prefix = host_prefix + rel_prefix
|
||||||
|
|
||||||
resp = self.handle_custom_response(wb_url, full_prefix, host_prefix, kwargs)
|
resp = self.handle_custom_response(environ, wb_url,
|
||||||
|
full_prefix, host_prefix, kwargs)
|
||||||
if resp is not None:
|
if resp is not None:
|
||||||
return resp
|
return WbResponse.text_response(resp, content_type='text/html')
|
||||||
|
|
||||||
urlrewriter = UrlRewriter(wb_url,
|
urlrewriter = UrlRewriter(wb_url,
|
||||||
prefix=full_prefix,
|
prefix=full_prefix,
|
||||||
full_prefix=full_prefix,
|
full_prefix=full_prefix,
|
||||||
rel_prefix=rel_prefix)
|
rel_prefix=rel_prefix)
|
||||||
|
|
||||||
self.unrewrite_referrer()
|
self.unrewrite_referrer(environ)
|
||||||
|
|
||||||
url = wb_url.url
|
url = wb_url.url
|
||||||
urlkey = canonicalize(url)
|
urlkey = canonicalize(url)
|
||||||
|
|
||||||
inputreq = RewriteInputRequest(request.environ, urlkey, url,
|
inputreq = RewriteInputRequest(environ, urlkey, url,
|
||||||
self.content_rewriter)
|
self.content_rewriter)
|
||||||
|
|
||||||
mod_url = None
|
mod_url = None
|
||||||
@ -86,7 +119,7 @@ class RewriterApp(object):
|
|||||||
wb_url.url = mod_url
|
wb_url.url = mod_url
|
||||||
inputreq.url = mod_url
|
inputreq.url = mod_url
|
||||||
|
|
||||||
del request.environ['HTTP_RANGE']
|
del environ['HTTP_RANGE']
|
||||||
readd_range = True
|
readd_range = True
|
||||||
else:
|
else:
|
||||||
async_record_url = mod_url
|
async_record_url = mod_url
|
||||||
@ -107,12 +140,12 @@ class RewriterApp(object):
|
|||||||
else:
|
else:
|
||||||
error = ''
|
error = ''
|
||||||
|
|
||||||
data = dict(url=url, args=kwargs, error=error)
|
details = dict(args=kwargs, error=error)
|
||||||
raise HTTPError(r.status_code, exception=data)
|
raise UpstreamException(r.status_code, url=url, details=details)
|
||||||
|
|
||||||
if async_record_url:
|
if async_record_url:
|
||||||
#print('ASYNC REC', async_record_url)
|
#print('ASYNC REC', async_record_url)
|
||||||
request.environ.pop('HTTP_RANGE', '')
|
environ.pop('HTTP_RANGE', '')
|
||||||
gevent.spawn(self._do_async_req,
|
gevent.spawn(self._do_async_req,
|
||||||
inputreq,
|
inputreq,
|
||||||
async_record_url,
|
async_record_url,
|
||||||
@ -139,14 +172,16 @@ class RewriterApp(object):
|
|||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if self.is_ajax():
|
if self.is_ajax(environ):
|
||||||
head_insert_func = None
|
head_insert_func = None
|
||||||
else:
|
else:
|
||||||
|
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
|
||||||
head_insert_func = (self.head_insert_view.
|
head_insert_func = (self.head_insert_view.
|
||||||
create_insert_func(wb_url,
|
create_insert_func(wb_url,
|
||||||
full_prefix,
|
full_prefix,
|
||||||
host_prefix,
|
host_prefix,
|
||||||
request.environ,
|
top_url,
|
||||||
|
environ,
|
||||||
self.framed_replay))
|
self.framed_replay))
|
||||||
|
|
||||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||||
@ -157,17 +192,15 @@ class RewriterApp(object):
|
|||||||
cdx)
|
cdx)
|
||||||
|
|
||||||
status_headers, gen, is_rw = result
|
status_headers, gen, is_rw = result
|
||||||
|
return WbResponse(status_headers, gen)
|
||||||
|
|
||||||
response.status = int(status_headers.get_statuscode())
|
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
    """Return the top-level url for the given wb_url.

    The default is ``full_prefix`` followed by ``wb_url`` rendered with
    an empty modifier. ``cdx`` and ``kwargs`` are not used by this
    default implementation.

    :param full_prefix: host and path prefix of this app
    :param wb_url: parsed wb_url object providing ``to_str(mod=...)``
    :param cdx: cdx entry of the resource being served
    :param kwargs: custom request params
    :return: the top url as a string
    """
    return full_prefix + wb_url.to_str(mod='')
|
||||||
|
|
||||||
return gen
|
|
||||||
|
|
||||||
def _do_async_req(self, *args):
|
def _do_async_req(self, *args):
|
||||||
count = 0
|
count = 0
|
||||||
#print('ASYNC')
|
|
||||||
try:
|
try:
|
||||||
r = self._do_req(*args)
|
r = self._do_req(*args)
|
||||||
while True:
|
while True:
|
||||||
@ -180,13 +213,17 @@ class RewriterApp(object):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
#print('CLOSING')
|
|
||||||
#print('READ ASYNC', count)
|
|
||||||
try:
|
try:
|
||||||
r.raw.close()
|
r.raw.close()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def handle_error(self, environ, ue):
    """Render the error view for an upstream failure.

    :param environ: WSGI environ of the request
    :param ue: exception providing ``url`` and ``msg`` attributes
    :return: an html ``WbResponse`` containing the rendered error page
    """
    template_args = dict(err_msg=ue.url, err_details=ue.msg)
    body = self.error_view.render_to_string(environ, **template_args)
    return WbResponse.text_response(body, content_type='text/html')
|
||||||
|
|
||||||
def _do_req(self, inputreq, url, wb_url, kwargs, skip):
|
def _do_req(self, inputreq, url, wb_url, kwargs, skip):
|
||||||
req_data = inputreq.reconstruct_request(url)
|
req_data = inputreq.reconstruct_request(url)
|
||||||
@ -213,36 +250,92 @@ class RewriterApp(object):
|
|||||||
def do_query(self, wb_url, kwargs):
    """Fetch the index listing for ``wb_url`` from the upstream server.

    The upstream resource url is turned into an index url, then json
    output and the from/to timestamp range of ``wb_url`` are appended
    as query arguments.

    :param wb_url: parsed wb_url with ``url``, ``timestamp`` and
        ``end_timestamp`` attributes
    :param kwargs: custom request params
    :return: text body of the upstream response
    """
    index_url = self.get_upstream_url(wb_url.url, wb_url, 'now', kwargs)
    index_url = index_url.replace('/resource/postreq', '/index')

    query_args = ('&output=json' +
                  '&from=' + wb_url.timestamp +
                  '&to=' + wb_url.end_timestamp)

    return requests.get(index_url + query_args).text
|
||||||
|
|
||||||
def get_host_prefix(self):
|
def handle_query(self, environ, wb_url, kwargs):
    """Render the query view for a query wb_url.

    The text returned by ``do_query`` is read as one JSON object per
    line; empty lines are skipped. Every parsed entry is handed to
    ``process_query_cdx`` before being added to the template params.
    A truthy result of ``get_query_params`` is merged into the params.

    :param environ: WSGI environ of the request
    :param wb_url: parsed wb_url of the query
    :param kwargs: custom request params
    :return: the rendered query view
    """
    raw_index = self.do_query(wb_url, kwargs)
    prefix = self.get_full_prefix(environ)
    url = wb_url.url

    cdx_list = []
    for line in raw_index.rstrip().split('\n'):
        if line:
            entry = json.loads(line)
            # Extension hook: may modify the entry in place.
            self.process_query_cdx(entry, wb_url, kwargs)
            cdx_list.append(entry)

    template_params = dict(url=url, prefix=prefix, cdx_lines=cdx_list)

    extras = self.get_query_params(wb_url, kwargs)
    if extras:
        template_params.update(extras)

    return self.query_view.render_to_string(environ, **template_params)
|
||||||
|
|
||||||
|
def process_query_cdx(self, cdx, wb_url, kwargs):
    """Extension hook called by ``handle_query`` for each parsed cdx entry.

    The default implementation does nothing.

    :param cdx: parsed cdx entry
    :param wb_url: parsed wb_url of the query
    :param kwargs: custom request params
    """
    return None
|
||||||
|
|
||||||
|
def get_query_params(self, wb_url, kwargs):
    """Extension hook supplying extra template params for the query view.

    ``handle_query`` merges a truthy return value into the params it
    passes to the query view. The default implementation supplies none.

    :param wb_url: parsed wb_url of the query
    :param kwargs: custom request params
    :return: ``None``
    """
    extra_params = None
    return extra_params
|
||||||
|
|
||||||
|
def get_host_prefix(self, environ):
    """Return ``scheme://host[:port]`` for the current request.

    A non-empty ``HTTP_HOST`` header is used as is. Otherwise the prefix
    is built from ``SERVER_NAME``, and ``SERVER_PORT`` is appended only
    when it differs from the scheme default ('443' for https, '80'
    for anything else).

    :param environ: WSGI environ of the request
    :return: the host prefix string
    """
    scheme = environ['wsgi.url_scheme']
    prefix = scheme + '://'

    host_header = environ.get('HTTP_HOST')
    if host_header:
        return prefix + host_header

    prefix += environ['SERVER_NAME']

    default_port = '443' if scheme == 'https' else '80'
    port = environ['SERVER_PORT']
    if port != default_port:
        prefix += ':' + port

    return prefix
|
||||||
|
|
||||||
|
def get_rel_prefix(self, environ):
    """Return the path prefix of this app, ending with a slash.

    :param environ: WSGI environ of the request
    :return: ``SCRIPT_NAME`` followed by ``'/'``; just ``'/'`` when
        ``SCRIPT_NAME`` is missing or empty
    """
    # A WSGI server may omit SCRIPT_NAME when its value is empty, so a
    # missing key must be treated as '' rather than concatenating None.
    script_name = environ.get('SCRIPT_NAME') or ''
    return script_name + '/'
|
||||||
|
|
||||||
|
def get_full_prefix(self, environ):
    """Return the host prefix followed by the relative prefix.

    :param environ: WSGI environ of the request
    :return: concatenation of ``get_host_prefix`` and ``get_rel_prefix``
    """
    host_part = self.get_host_prefix(environ)
    path_part = self.get_rel_prefix(environ)
    return host_part + path_part
|
||||||
|
|
||||||
|
def get_wburl(self, environ):
    """Return the wb_url string of the request.

    This is ``PATH_INFO`` (default ``'/'``) without its first character,
    followed by ``'?'`` and ``QUERY_STRING`` when a non-empty query
    string is present.

    :param environ: WSGI environ of the request
    :return: the wb_url as a plain string
    """
    path = environ.get('PATH_INFO', '/')[1:]
    query = environ.get('QUERY_STRING')
    if not query:
        return path
    return path + '?' + query
|
||||||
|
|
||||||
|
def unrewrite_referrer(self, environ):
    """Replace a rewritten ``HTTP_REFERER`` with the url it refers to.

    When the referrer starts with the full prefix of this app, the
    prefix is removed, the remainder is parsed as a ``WbUrl`` and its
    ``url`` is written back to ``environ['HTTP_REFERER']``.

    :param environ: WSGI environ of the request, updated in place
    :return: ``True`` if the referrer was replaced, ``False`` otherwise
    """
    current = environ.get('HTTP_REFERER')
    if not current:
        return False

    prefix = self.get_full_prefix(environ)
    if not current.startswith(prefix):
        return False

    environ['HTTP_REFERER'] = WbUrl(current[len(prefix):]).url
    return True
|
||||||
|
|
||||||
def is_ajax(self):
|
def is_ajax(self, environ):
|
||||||
value = request.environ.get('HTTP_X_REQUESTED_WITH')
|
value = environ.get('HTTP_X_REQUESTED_WITH')
|
||||||
value = value or request.environ.get('HTTP_X_PYWB_REQUESTED_WITH')
|
value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
|
||||||
if value and value.lower() == 'xmlhttprequest':
|
if value and value.lower() == 'xmlhttprequest':
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -258,16 +351,17 @@ class RewriterApp(object):
|
|||||||
def get_top_frame_params(self, wb_url, kwargs):
    """Extension hook supplying extra params for the top frame view.

    The default implementation supplies none.

    :param wb_url: parsed wb_url of the request
    :param kwargs: custom request params
    :return: ``None``
    """
    extra_params = None
    return extra_params
|
||||||
|
|
||||||
def handle_custom_response(self, wb_url, full_prefix, host_prefix, kwargs):
|
def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
|
||||||
if wb_url.is_query():
|
if wb_url.is_query():
|
||||||
return self.do_query(wb_url, kwargs)
|
return self.handle_query(environ, wb_url, kwargs)
|
||||||
|
#return self.do_query(wb_url, kwargs)
|
||||||
|
|
||||||
if self.framed_replay and wb_url.mod == self.frame_mod:
|
if self.framed_replay and wb_url.mod == self.frame_mod:
|
||||||
extra_params = self.get_top_frame_params(wb_url, kwargs)
|
extra_params = self.get_top_frame_params(wb_url, kwargs)
|
||||||
return self.frame_insert_view.get_top_frame(wb_url,
|
return self.frame_insert_view.get_top_frame(wb_url,
|
||||||
full_prefix,
|
full_prefix,
|
||||||
host_prefix,
|
host_prefix,
|
||||||
request.environ,
|
environ,
|
||||||
self.frame_mod,
|
self.frame_mod,
|
||||||
self.replay_mod,
|
self.replay_mod,
|
||||||
coll='',
|
coll='',
|
||||||
|
@ -87,7 +87,7 @@ class JinjaEnv(object):
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class BaseInsertView(object):
|
class BaseInsertView(object):
|
||||||
def __init__(self, jenv, insert_file, banner_file):
|
def __init__(self, jenv, insert_file, banner_file=''):
|
||||||
self.jenv = jenv
|
self.jenv = jenv
|
||||||
self.insert_file = insert_file
|
self.insert_file = insert_file
|
||||||
self.banner_file = banner_file
|
self.banner_file = banner_file
|
||||||
@ -106,6 +106,7 @@ class HeadInsertView(BaseInsertView):
|
|||||||
def create_insert_func(self, wb_url,
|
def create_insert_func(self, wb_url,
|
||||||
wb_prefix,
|
wb_prefix,
|
||||||
host_prefix,
|
host_prefix,
|
||||||
|
top_url,
|
||||||
env,
|
env,
|
||||||
is_framed,
|
is_framed,
|
||||||
coll='',
|
coll='',
|
||||||
@ -113,9 +114,6 @@ class HeadInsertView(BaseInsertView):
|
|||||||
|
|
||||||
url = wb_url.get_url()
|
url = wb_url.get_url()
|
||||||
|
|
||||||
top_url = wb_prefix
|
|
||||||
top_url += wb_url.to_str(mod='')
|
|
||||||
|
|
||||||
include_wombat = not wb_url.is_banner_only
|
include_wombat = not wb_url.is_banner_only
|
||||||
|
|
||||||
wbrequest = {'host_prefix': host_prefix,
|
wbrequest = {'host_prefix': host_prefix,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user