1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

urlrewrite: remove dependency on bottle from rewriterapp,

add overridable error and query views, with extensible get_query_params() and process_query_cdx()
hooks to extend the cdx data passed to the query view
add get_top_url() for adding custom top_url for frame insert
add call_with_params() for adding custom params to environ
This commit is contained in:
Ilya Kreymer 2016-04-25 12:03:23 -07:00
parent b056acd88e
commit 3b6cab1730
2 changed files with 140 additions and 48 deletions

View File

@ -1,30 +1,41 @@
import requests import requests
from bottle import request, response, HTTPError
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse
from urlrewrite.rewriteinputreq import RewriteInputRequest from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from io import BytesIO from io import BytesIO
import gevent import gevent
import json
# ============================================================================
class UpstreamException(WbException):
    """WbException that additionally records an upstream status code.

    ``url`` and ``details`` are forwarded to ``WbException`` as its ``url``
    and ``msg`` keyword arguments; ``status_code`` is kept on the instance.
    """

    def __init__(self, status_code, url, details):
        super(UpstreamException, self).__init__(msg=details, url=url)
        self.status_code = status_code
# ============================================================================ # ============================================================================
class RewriterApp(object): class RewriterApp(object):
def __init__(self, framed_replay=False, jinja_env=None): def __init__(self, framed_replay=False, jinja_env=None, config=None):
self.loader = ArcWarcRecordLoader() self.loader = ArcWarcRecordLoader()
config = config or {}
self.framed_replay = framed_replay self.framed_replay = framed_replay
self.frame_mod = '' self.frame_mod = ''
self.replay_mod = 'mp_' self.replay_mod = 'mp_'
@ -37,33 +48,55 @@ class RewriterApp(object):
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'}) jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
self.jinja_env = jinja_env self.jinja_env = jinja_env
self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html') self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html')
self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html') self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html')
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
def render_content(self, wb_url, **kwargs): def call_with_params(self, **kwargs):
def run_app(environ, start_response):
environ['pywb.kwargs'] = kwargs
return self(environ, start_response)
return run_app
def __call__(self, environ, start_response):
    """WSGI entry point.

    Builds the wb_url string from ``environ``, picks up any extra params
    stored under ``environ['pywb.kwargs']`` and renders the content.
    An ``UpstreamException`` raised while rendering is turned into an
    error response via ``handle_error``. The resulting response object is
    then invoked as a WSGI application.
    """
    target = self.get_wburl(environ)
    params = environ.get('pywb.kwargs', {})

    try:
        wb_response = self.render_content(target, params, environ)
    except UpstreamException as upstream_err:
        wb_response = self.handle_error(environ, upstream_err)

    return wb_response(environ, start_response)
def render_content(self, wb_url, kwargs, environ):
wb_url = WbUrl(wb_url) wb_url = WbUrl(wb_url)
#if wb_url.mod == 'vi_': #if wb_url.mod == 'vi_':
# return self._get_video_info(wbrequest) # return self._get_video_info(wbrequest)
host_prefix = self.get_host_prefix() host_prefix = self.get_host_prefix(environ)
rel_prefix = self.get_rel_prefix() rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix full_prefix = host_prefix + rel_prefix
resp = self.handle_custom_response(wb_url, full_prefix, host_prefix, kwargs) resp = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix, kwargs)
if resp is not None: if resp is not None:
return resp return WbResponse.text_response(resp, content_type='text/html')
urlrewriter = UrlRewriter(wb_url, urlrewriter = UrlRewriter(wb_url,
prefix=full_prefix, prefix=full_prefix,
full_prefix=full_prefix, full_prefix=full_prefix,
rel_prefix=rel_prefix) rel_prefix=rel_prefix)
self.unrewrite_referrer() self.unrewrite_referrer(environ)
url = wb_url.url url = wb_url.url
urlkey = canonicalize(url) urlkey = canonicalize(url)
inputreq = RewriteInputRequest(request.environ, urlkey, url, inputreq = RewriteInputRequest(environ, urlkey, url,
self.content_rewriter) self.content_rewriter)
mod_url = None mod_url = None
@ -86,7 +119,7 @@ class RewriterApp(object):
wb_url.url = mod_url wb_url.url = mod_url
inputreq.url = mod_url inputreq.url = mod_url
del request.environ['HTTP_RANGE'] del environ['HTTP_RANGE']
readd_range = True readd_range = True
else: else:
async_record_url = mod_url async_record_url = mod_url
@ -107,12 +140,12 @@ class RewriterApp(object):
else: else:
error = '' error = ''
data = dict(url=url, args=kwargs, error=error) details = dict(args=kwargs, error=error)
raise HTTPError(r.status_code, exception=data) raise UpstreamException(r.status_code, url=url, details=details)
if async_record_url: if async_record_url:
#print('ASYNC REC', async_record_url) #print('ASYNC REC', async_record_url)
request.environ.pop('HTTP_RANGE', '') environ.pop('HTTP_RANGE', '')
gevent.spawn(self._do_async_req, gevent.spawn(self._do_async_req,
inputreq, inputreq,
async_record_url, async_record_url,
@ -139,14 +172,16 @@ class RewriterApp(object):
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
if self.is_ajax(): if self.is_ajax(environ):
head_insert_func = None head_insert_func = None
else: else:
top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
head_insert_func = (self.head_insert_view. head_insert_func = (self.head_insert_view.
create_insert_func(wb_url, create_insert_func(wb_url,
full_prefix, full_prefix,
host_prefix, host_prefix,
request.environ, top_url,
environ,
self.framed_replay)) self.framed_replay))
result = self.content_rewriter.rewrite_content(urlrewriter, result = self.content_rewriter.rewrite_content(urlrewriter,
@ -157,17 +192,15 @@ class RewriterApp(object):
cdx) cdx)
status_headers, gen, is_rw = result status_headers, gen, is_rw = result
return WbResponse(status_headers, gen)
response.status = int(status_headers.get_statuscode()) def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
top_url = full_prefix
for n, v in status_headers.headers: top_url += wb_url.to_str(mod='')
response.add_header(n, v) return top_url
return gen
def _do_async_req(self, *args): def _do_async_req(self, *args):
count = 0 count = 0
#print('ASYNC')
try: try:
r = self._do_req(*args) r = self._do_req(*args)
while True: while True:
@ -180,13 +213,17 @@ class RewriterApp(object):
traceback.print_exc() traceback.print_exc()
finally: finally:
#print('CLOSING')
#print('READ ASYNC', count)
try: try:
r.raw.close() r.raw.close()
except: except:
pass pass
def handle_error(self, environ, ue):
    """Render the error template for an upstream error.

    The exception's ``url`` is passed to the template as ``err_msg`` and
    its ``msg`` as ``err_details``; the rendered page is returned as a
    ``text/html`` WbResponse.
    """
    # NOTE(review): ue.status_code is not used here, so the response status
    # is whatever WbResponse.text_response defaults to -- confirm intended.
    page = self.error_view.render_to_string(
        environ,
        err_msg=ue.url,
        err_details=ue.msg,
    )
    return WbResponse.text_response(page, content_type='text/html')
def _do_req(self, inputreq, url, wb_url, kwargs, skip): def _do_req(self, inputreq, url, wb_url, kwargs, skip):
req_data = inputreq.reconstruct_request(url) req_data = inputreq.reconstruct_request(url)
@ -213,36 +250,92 @@ class RewriterApp(object):
def do_query(self, wb_url, kwargs): def do_query(self, wb_url, kwargs):
upstream_url = self.get_upstream_url(wb_url.url, wb_url, 'now', kwargs) upstream_url = self.get_upstream_url(wb_url.url, wb_url, 'now', kwargs)
upstream_url = upstream_url.replace('/resource/postreq', '/index') upstream_url = upstream_url.replace('/resource/postreq', '/index')
r = requests.get(upstream_url + '&output=json')
print(r.text) upstream_url += '&output=json'
upstream_url += '&from=' + wb_url.timestamp + '&to=' + wb_url.end_timestamp
r = requests.get(upstream_url)
return r.text return r.text
def get_host_prefix(self): def handle_query(self, environ, wb_url, kwargs):
return request.urlparts.scheme + '://' + request.urlparts.netloc res = self.do_query(wb_url, kwargs)
def get_rel_prefix(self): def format_cdx(text):
return request.script_name cdx_lines = text.rstrip().split('\n')
for cdx in cdx_lines:
if not cdx:
continue
def get_full_prefix(self): cdx = json.loads(cdx)
return self.get_host_prefix() + self.get_rel_prefix() self.process_query_cdx(cdx, wb_url, kwargs)
yield cdx
def unrewrite_referrer(self): prefix = self.get_full_prefix(environ)
referrer = request.environ.get('HTTP_REFERER')
params = dict(url=wb_url.url,
prefix=prefix,
cdx_lines=list(format_cdx(res)))
extra_params = self.get_query_params(wb_url, kwargs)
if extra_params:
params.update(extra_params)
return self.query_view.render_to_string(environ, **params)
def process_query_cdx(self, cdx, wb_url, kwargs):
    """Hook called for each parsed cdx entry of a query; no-op by default.

    Override to adjust ``cdx`` in place before it is handed to the
    query view.
    """
    return None
def get_query_params(self, wb_url, kwargs):
    """Hook supplying extra template params for the query view.

    Returns ``None`` by default; an override may return a dict, which is
    merged into the params passed to the query template.
    """
    extra_params = None
    return extra_params
def get_host_prefix(self, environ):
    """Return ``scheme://host[:port]`` for the current WSGI request.

    The ``Host`` header (``HTTP_HOST``) is used verbatim when present and
    non-empty. Otherwise the value is built from ``SERVER_NAME`` and
    ``SERVER_PORT``, leaving the port off when it is the scheme's default
    (443 for https, 80 for anything else).
    """
    scheme = environ['wsgi.url_scheme']
    prefix = scheme + '://'

    host_header = environ.get('HTTP_HOST')
    if host_header:
        return prefix + host_header

    prefix += environ['SERVER_NAME']

    default_port = '443' if scheme == 'https' else '80'
    if environ['SERVER_PORT'] != default_port:
        prefix += ':' + environ['SERVER_PORT']

    return prefix
def get_rel_prefix(self, environ):
    """Return the application mount path followed by a trailing slash.

    ``SCRIPT_NAME`` may legitimately be absent from a WSGI environ when
    its value would be empty (app mounted at the server root), so a
    missing key is treated as ``''`` instead of raising ``TypeError``.
    """
    return environ.get('SCRIPT_NAME', '') + '/'
def get_full_prefix(self, environ):
    """Return the host prefix joined with the relative prefix."""
    host_part = self.get_host_prefix(environ)
    rel_part = self.get_rel_prefix(environ)
    return host_part + rel_part
def get_wburl(self, environ):
    """Build the wb_url string from the WSGI request path and query.

    The first character of ``PATH_INFO`` (default ``'/'``) is dropped,
    and a non-empty ``QUERY_STRING`` is appended after a ``?``.
    """
    path = environ.get('PATH_INFO', '/')
    target = path[1:]

    query = environ.get('QUERY_STRING')
    if not query:
        return target

    return target + '?' + query
def unrewrite_referrer(self, environ):
referrer = environ.get('HTTP_REFERER')
if not referrer: if not referrer:
return False return False
full_prefix = self.get_full_prefix() full_prefix = self.get_full_prefix(environ)
if referrer.startswith(full_prefix): if referrer.startswith(full_prefix):
referrer = referrer[len(full_prefix):] referrer = referrer[len(full_prefix):]
request.environ['HTTP_REFERER'] = WbUrl(referrer).url environ['HTTP_REFERER'] = WbUrl(referrer).url
return True return True
return False return False
def is_ajax(self): def is_ajax(self, environ):
value = request.environ.get('HTTP_X_REQUESTED_WITH') value = environ.get('HTTP_X_REQUESTED_WITH')
value = value or request.environ.get('HTTP_X_PYWB_REQUESTED_WITH') value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
if value and value.lower() == 'xmlhttprequest': if value and value.lower() == 'xmlhttprequest':
return True return True
@ -258,16 +351,17 @@ class RewriterApp(object):
def get_top_frame_params(self, wb_url, kwargs): def get_top_frame_params(self, wb_url, kwargs):
return None return None
def handle_custom_response(self, wb_url, full_prefix, host_prefix, kwargs): def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
if wb_url.is_query(): if wb_url.is_query():
return self.do_query(wb_url, kwargs) return self.handle_query(environ, wb_url, kwargs)
#return self.do_query(wb_url, kwargs)
if self.framed_replay and wb_url.mod == self.frame_mod: if self.framed_replay and wb_url.mod == self.frame_mod:
extra_params = self.get_top_frame_params(wb_url, kwargs) extra_params = self.get_top_frame_params(wb_url, kwargs)
return self.frame_insert_view.get_top_frame(wb_url, return self.frame_insert_view.get_top_frame(wb_url,
full_prefix, full_prefix,
host_prefix, host_prefix,
request.environ, environ,
self.frame_mod, self.frame_mod,
self.replay_mod, self.replay_mod,
coll='', coll='',

View File

@ -87,7 +87,7 @@ class JinjaEnv(object):
# ============================================================================ # ============================================================================
class BaseInsertView(object): class BaseInsertView(object):
def __init__(self, jenv, insert_file, banner_file): def __init__(self, jenv, insert_file, banner_file=''):
self.jenv = jenv self.jenv = jenv
self.insert_file = insert_file self.insert_file = insert_file
self.banner_file = banner_file self.banner_file = banner_file
@ -106,6 +106,7 @@ class HeadInsertView(BaseInsertView):
def create_insert_func(self, wb_url, def create_insert_func(self, wb_url,
wb_prefix, wb_prefix,
host_prefix, host_prefix,
top_url,
env, env,
is_framed, is_framed,
coll='', coll='',
@ -113,9 +114,6 @@ class HeadInsertView(BaseInsertView):
url = wb_url.get_url() url = wb_url.get_url()
top_url = wb_prefix
top_url += wb_url.to_str(mod='')
include_wombat = not wb_url.is_banner_only include_wombat = not wb_url.is_banner_only
wbrequest = {'host_prefix': host_prefix, wbrequest = {'host_prefix': host_prefix,