mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
urlrewrite: remove dependency on bottle from rewriterapp.
Add overridable error and query views, with extensible get_query_params() and process_query_cdx() hooks to extend the cdx for the query view; add get_top_url() for supplying a custom top_url for the frame insert; add call_with_params() for adding custom params to environ.
This commit is contained in:
parent
b056acd88e
commit
3b6cab1730
@ -1,30 +1,41 @@
|
||||
import requests
|
||||
|
||||
from bottle import request, response, HTTPError
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.timeutils import http_date_to_timestamp
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
|
||||
from urlrewrite.rewriteinputreq import RewriteInputRequest
|
||||
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView
|
||||
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
from io import BytesIO
|
||||
import gevent
|
||||
import json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class UpstreamException(WbException):
    """WbException variant that also records an HTTP status code.

    The url and details are handed to WbException as ``url`` and ``msg``;
    the status code is kept on the instance as ``status_code``.
    """

    def __init__(self, status_code, url, details):
        """Initialize the base exception, then store ``status_code``."""
        super(UpstreamException, self).__init__(url=url, msg=details)
        self.status_code = status_code
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriterApp(object):
|
||||
def __init__(self, framed_replay=False, jinja_env=None):
|
||||
def __init__(self, framed_replay=False, jinja_env=None, config=None):
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
|
||||
config = config or {}
|
||||
|
||||
self.framed_replay = framed_replay
|
||||
self.frame_mod = ''
|
||||
self.replay_mod = 'mp_'
|
||||
@ -37,33 +48,55 @@ class RewriterApp(object):
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
|
||||
|
||||
self.jinja_env = jinja_env
|
||||
|
||||
self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
|
||||
self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
|
||||
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
|
||||
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
|
||||
|
||||
def render_content(self, wb_url, **kwargs):
|
||||
def call_with_params(self, **kwargs):
    """Return a WSGI callable that runs this app with fixed extra params.

    The returned callable stores ``kwargs`` in the environ under the
    'pywb.kwargs' key and then delegates to this app, so the params are
    available when the request is handled.
    """
    def run_app(environ, start_response):
        # Stash the bound params where __call__ looks them up.
        environ['pywb.kwargs'] = kwargs
        return self(environ, start_response)

    return run_app
|
||||
|
||||
def __call__(self, environ, start_response):
    """WSGI entry point: render the requested url and send the response.

    The wb_url is derived from the environ via get_wburl(); extra params
    are read from the 'pywb.kwargs' environ key (empty dict if absent).
    An UpstreamException raised while rendering is turned into a response
    by handle_error(); other exceptions propagate.
    """
    requested = self.get_wburl(environ)
    params = environ.get('pywb.kwargs', {})

    try:
        wb_response = self.render_content(requested, params, environ)
    except UpstreamException as upstream_err:
        wb_response = self.handle_error(environ, upstream_err)

    # The response object is itself invoked WSGI-style.
    return wb_response(environ, start_response)
|
||||
|
||||
def render_content(self, wb_url, kwargs, environ):
|
||||
wb_url = WbUrl(wb_url)
|
||||
#if wb_url.mod == 'vi_':
|
||||
# return self._get_video_info(wbrequest)
|
||||
|
||||
host_prefix = self.get_host_prefix()
|
||||
rel_prefix = self.get_rel_prefix()
|
||||
host_prefix = self.get_host_prefix(environ)
|
||||
rel_prefix = self.get_rel_prefix(environ)
|
||||
full_prefix = host_prefix + rel_prefix
|
||||
|
||||
resp = self.handle_custom_response(wb_url, full_prefix, host_prefix, kwargs)
|
||||
resp = self.handle_custom_response(environ, wb_url,
|
||||
full_prefix, host_prefix, kwargs)
|
||||
if resp is not None:
|
||||
return resp
|
||||
return WbResponse.text_response(resp, content_type='text/html')
|
||||
|
||||
urlrewriter = UrlRewriter(wb_url,
|
||||
prefix=full_prefix,
|
||||
full_prefix=full_prefix,
|
||||
rel_prefix=rel_prefix)
|
||||
|
||||
self.unrewrite_referrer()
|
||||
self.unrewrite_referrer(environ)
|
||||
|
||||
url = wb_url.url
|
||||
urlkey = canonicalize(url)
|
||||
|
||||
inputreq = RewriteInputRequest(request.environ, urlkey, url,
|
||||
inputreq = RewriteInputRequest(environ, urlkey, url,
|
||||
self.content_rewriter)
|
||||
|
||||
mod_url = None
|
||||
@ -86,7 +119,7 @@ class RewriterApp(object):
|
||||
wb_url.url = mod_url
|
||||
inputreq.url = mod_url
|
||||
|
||||
del request.environ['HTTP_RANGE']
|
||||
del environ['HTTP_RANGE']
|
||||
readd_range = True
|
||||
else:
|
||||
async_record_url = mod_url
|
||||
@ -107,12 +140,12 @@ class RewriterApp(object):
|
||||
else:
|
||||
error = ''
|
||||
|
||||
data = dict(url=url, args=kwargs, error=error)
|
||||
raise HTTPError(r.status_code, exception=data)
|
||||
details = dict(args=kwargs, error=error)
|
||||
raise UpstreamException(r.status_code, url=url, details=details)
|
||||
|
||||
if async_record_url:
|
||||
#print('ASYNC REC', async_record_url)
|
||||
request.environ.pop('HTTP_RANGE', '')
|
||||
environ.pop('HTTP_RANGE', '')
|
||||
gevent.spawn(self._do_async_req,
|
||||
inputreq,
|
||||
async_record_url,
|
||||
@ -139,14 +172,16 @@ class RewriterApp(object):
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
if self.is_ajax():
|
||||
if self.is_ajax(environ):
|
||||
head_insert_func = None
|
||||
else:
|
||||
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
|
||||
head_insert_func = (self.head_insert_view.
|
||||
create_insert_func(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
request.environ,
|
||||
top_url,
|
||||
environ,
|
||||
self.framed_replay))
|
||||
|
||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||
@ -157,17 +192,15 @@ class RewriterApp(object):
|
||||
cdx)
|
||||
|
||||
status_headers, gen, is_rw = result
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
response.status = int(status_headers.get_statuscode())
|
||||
|
||||
for n, v in status_headers.headers:
|
||||
response.add_header(n, v)
|
||||
|
||||
return gen
|
||||
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
    """Return the top-level url handed to the head insert.

    The default is ``full_prefix`` followed by ``wb_url`` serialized with
    an empty modifier. ``cdx`` and ``kwargs`` are unused here; they are
    available to overriding implementations.
    """
    return full_prefix + wb_url.to_str(mod='')
|
||||
|
||||
def _do_async_req(self, *args):
|
||||
count = 0
|
||||
#print('ASYNC')
|
||||
try:
|
||||
r = self._do_req(*args)
|
||||
while True:
|
||||
@ -180,13 +213,17 @@ class RewriterApp(object):
|
||||
traceback.print_exc()
|
||||
|
||||
finally:
|
||||
#print('CLOSING')
|
||||
#print('READ ASYNC', count)
|
||||
try:
|
||||
r.raw.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
def handle_error(self, environ, ue):
    """Build an HTML error response for an upstream failure.

    Renders the error view with ``ue.url`` as ``err_msg`` and ``ue.msg``
    as ``err_details`` and wraps the result in a text/html WbResponse.
    """
    # NOTE(review): ue.status_code is not passed on here, so the response
    # status is whatever text_response() defaults to -- confirm whether the
    # upstream status should be propagated.
    page = self.error_view.render_to_string(environ,
                                            err_msg=ue.url,
                                            err_details=ue.msg)
    return WbResponse.text_response(page, content_type='text/html')
|
||||
|
||||
def _do_req(self, inputreq, url, wb_url, kwargs, skip):
|
||||
req_data = inputreq.reconstruct_request(url)
|
||||
@ -213,36 +250,92 @@ class RewriterApp(object):
|
||||
def do_query(self, wb_url, kwargs):
    """Fetch the raw index listing for a query url from the upstream.

    The upstream url is obtained from get_upstream_url() and pointed at
    the index endpoint by swapping '/resource/postreq' for '/index'.
    JSON output is requested, restricted to the timestamp range of
    ``wb_url`` (``timestamp`` .. ``end_timestamp``).

    Only one request is issued and nothing is printed: the earlier extra
    request and debug print of the body are dropped.

    Returns the response body as text.
    """
    upstream_url = self.get_upstream_url(wb_url.url, wb_url, 'now', kwargs)
    upstream_url = upstream_url.replace('/resource/postreq', '/index')

    upstream_url += '&output=json'
    upstream_url += '&from=' + wb_url.timestamp + '&to=' + wb_url.end_timestamp

    r = requests.get(upstream_url)

    return r.text
|
||||
|
||||
def get_host_prefix(self):
|
||||
return request.urlparts.scheme + '://' + request.urlparts.netloc
|
||||
def handle_query(self, environ, wb_url, kwargs):
|
||||
res = self.do_query(wb_url, kwargs)
|
||||
|
||||
def get_rel_prefix(self):
|
||||
return request.script_name
|
||||
def format_cdx(text):
|
||||
cdx_lines = text.rstrip().split('\n')
|
||||
for cdx in cdx_lines:
|
||||
if not cdx:
|
||||
continue
|
||||
|
||||
def get_full_prefix(self):
|
||||
return self.get_host_prefix() + self.get_rel_prefix()
|
||||
cdx = json.loads(cdx)
|
||||
self.process_query_cdx(cdx, wb_url, kwargs)
|
||||
yield cdx
|
||||
|
||||
def unrewrite_referrer(self):
|
||||
referrer = request.environ.get('HTTP_REFERER')
|
||||
prefix = self.get_full_prefix(environ)
|
||||
|
||||
params = dict(url=wb_url.url,
|
||||
prefix=prefix,
|
||||
cdx_lines=list(format_cdx(res)))
|
||||
|
||||
extra_params = self.get_query_params(wb_url, kwargs)
|
||||
if extra_params:
|
||||
params.update(extra_params)
|
||||
|
||||
return self.query_view.render_to_string(environ, **params)
|
||||
|
||||
def process_query_cdx(self, cdx, wb_url, kwargs):
    """Hook invoked by handle_query() for each parsed cdx entry.

    The default does nothing; an override may adjust ``cdx`` in place
    before it is added to the query view's cdx list.
    """
    return None
|
||||
|
||||
def get_query_params(self, wb_url, kwargs):
    """Hook returning extra template params for the query view.

    handle_query() merges a truthy return value into the params passed to
    the query template. The default contributes nothing.
    """
    return None
|
||||
|
||||
def get_host_prefix(self, environ):
    """Return '<scheme>://<host>' for the current request.

    The Host header (HTTP_HOST) is used when present and non-empty.
    Otherwise the value is built from SERVER_NAME, with SERVER_PORT
    appended only when it is not the default port for the scheme
    ('443' for https, '80' otherwise).
    """
    scheme = environ['wsgi.url_scheme']

    host = environ.get('HTTP_HOST')
    if host:
        return scheme + '://' + host

    url = scheme + '://' + environ['SERVER_NAME']

    default_port = '443' if scheme == 'https' else '80'
    port = environ['SERVER_PORT']
    if port != default_port:
        url += ':' + port

    return url
|
||||
|
||||
def get_rel_prefix(self, environ):
    """Return the app's mount path with a trailing slash.

    Built from the SCRIPT_NAME environ entry. WSGI allows SCRIPT_NAME to
    be omitted when it would be empty, so a missing (or None) value is
    treated as '' instead of failing on ``None + '/'``.
    """
    return (environ.get('SCRIPT_NAME') or '') + '/'
|
||||
|
||||
def get_full_prefix(self, environ):
    """Return the host prefix followed by the relative prefix."""
    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    return host_prefix + rel_prefix
|
||||
|
||||
def get_wburl(self, environ):
    """Return the requested wb_url string taken from the WSGI environ.

    This is PATH_INFO (default '/') without its first character, followed
    by '?' and QUERY_STRING when the query string is non-empty.
    """
    path = environ.get('PATH_INFO', '/')[1:]
    query = environ.get('QUERY_STRING')

    if not query:
        return path

    return path + '?' + query
|
||||
|
||||
def unrewrite_referrer(self, environ):
    """Replace a rewritten Referer header with the url it refers to.

    When HTTP_REFERER starts with this app's full prefix, the prefix is
    stripped, the remainder is parsed as a WbUrl, and its ``url`` is
    written back to ``environ['HTTP_REFERER']``.

    Everything is read from and written to the passed-in ``environ``:
    the prefix is computed via get_full_prefix(environ) and no global
    request object is involved.

    Returns True if the header was rewritten, False otherwise (no
    referrer, or referrer not under this app's prefix).
    """
    referrer = environ.get('HTTP_REFERER')
    if not referrer:
        return False

    full_prefix = self.get_full_prefix(environ)
    if not referrer.startswith(full_prefix):
        return False

    # Drop the prefix so that only the wb_url portion is parsed.
    environ['HTTP_REFERER'] = WbUrl(referrer[len(full_prefix):]).url
    return True
|
||||
|
||||
def is_ajax(self):
|
||||
value = request.environ.get('HTTP_X_REQUESTED_WITH')
|
||||
value = value or request.environ.get('HTTP_X_PYWB_REQUESTED_WITH')
|
||||
def is_ajax(self, environ):
|
||||
value = environ.get('HTTP_X_REQUESTED_WITH')
|
||||
value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
|
||||
if value and value.lower() == 'xmlhttprequest':
|
||||
return True
|
||||
|
||||
@ -258,16 +351,17 @@ class RewriterApp(object):
|
||||
def get_top_frame_params(self, wb_url, kwargs):
|
||||
return None
|
||||
|
||||
def handle_custom_response(self, wb_url, full_prefix, host_prefix, kwargs):
|
||||
def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
|
||||
if wb_url.is_query():
|
||||
return self.do_query(wb_url, kwargs)
|
||||
return self.handle_query(environ, wb_url, kwargs)
|
||||
#return self.do_query(wb_url, kwargs)
|
||||
|
||||
if self.framed_replay and wb_url.mod == self.frame_mod:
|
||||
extra_params = self.get_top_frame_params(wb_url, kwargs)
|
||||
return self.frame_insert_view.get_top_frame(wb_url,
|
||||
full_prefix,
|
||||
host_prefix,
|
||||
request.environ,
|
||||
environ,
|
||||
self.frame_mod,
|
||||
self.replay_mod,
|
||||
coll='',
|
||||
|
@ -87,7 +87,7 @@ class JinjaEnv(object):
|
||||
|
||||
# ============================================================================
|
||||
class BaseInsertView(object):
|
||||
def __init__(self, jenv, insert_file, banner_file):
|
||||
def __init__(self, jenv, insert_file, banner_file=''):
|
||||
self.jenv = jenv
|
||||
self.insert_file = insert_file
|
||||
self.banner_file = banner_file
|
||||
@ -106,6 +106,7 @@ class HeadInsertView(BaseInsertView):
|
||||
def create_insert_func(self, wb_url,
|
||||
wb_prefix,
|
||||
host_prefix,
|
||||
top_url,
|
||||
env,
|
||||
is_framed,
|
||||
coll='',
|
||||
@ -113,9 +114,6 @@ class HeadInsertView(BaseInsertView):
|
||||
|
||||
url = wb_url.get_url()
|
||||
|
||||
top_url = wb_prefix
|
||||
top_url += wb_url.to_str(mod='')
|
||||
|
||||
include_wombat = not wb_url.is_banner_only
|
||||
|
||||
wbrequest = {'host_prefix': host_prefix,
|
||||
|
Loading…
x
Reference in New Issue
Block a user