From 3b6cab1730e25610c12154799d957addb7337ade Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 25 Apr 2016 12:03:23 -0700 Subject: [PATCH] urlrewrite: remove dependency on bottle from rewriterapp, add overridable error and query views, with extensible get_query_params() and process_cdx_query() to extend cdx for query view add get_top_url() for adding custom top_url for frame insert add call_with_params() for adding custom params to environ --- urlrewrite/rewriterapp.py | 182 ++++++++++++++++++++++++++++--------- urlrewrite/templateview.py | 6 +- 2 files changed, 140 insertions(+), 48 deletions(-) diff --git a/urlrewrite/rewriterapp.py b/urlrewrite/rewriterapp.py index 84e536a9..fc630d8f 100644 --- a/urlrewrite/rewriterapp.py +++ b/urlrewrite/rewriterapp.py @@ -1,30 +1,41 @@ import requests -from bottle import request, response, HTTPError - from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.utils.wbexception import WbException from pywb.utils.canonicalize import canonicalize from pywb.utils.timeutils import http_date_to_timestamp from pywb.utils.loaders import extract_client_cookie from pywb.cdx.cdxobject import CDXObject from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.framework.wbrequestresponse import WbResponse + from urlrewrite.rewriteinputreq import RewriteInputRequest -from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView +from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView from io import BytesIO import gevent +import json + + +# ============================================================================ +class UpstreamException(WbException): + def __init__(self, status_code, url, details): + super(UpstreamException, self).__init__(url=url, msg=details) + self.status_code = status_code # ============================================================================ class RewriterApp(object): - def __init__(self, framed_replay=False, jinja_env=None): + def __init__(self, framed_replay=False, jinja_env=None, config=None): self.loader = ArcWarcRecordLoader() + config = config or {} + self.framed_replay = framed_replay self.frame_mod = '' self.replay_mod = 'mp_' @@ -37,33 +48,55 @@ class RewriterApp(object): jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'}) self.jinja_env = jinja_env + self.head_insert_view = HeadInsertView(self.jinja_env, 'head_insert.html', 'banner.html') self.frame_insert_view = TopFrameView(self.jinja_env, 'frame_insert.html', 'banner.html') + self.error_view = BaseInsertView(self.jinja_env, 'error.html') + self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html')) - def render_content(self, wb_url, **kwargs): + def call_with_params(self, **kwargs): + def run_app(environ, start_response): + environ['pywb.kwargs'] = kwargs + return self(environ, start_response) + + return run_app + + def __call__(self, environ, start_response): + wb_url = self.get_wburl(environ) + kwargs = environ.get('pywb.kwargs', {}) + + try: + response = self.render_content(wb_url, kwargs, environ) + except UpstreamException as ue: + response = self.handle_error(environ, ue) + + return response(environ, start_response) + + def render_content(self, wb_url, kwargs, environ): wb_url = WbUrl(wb_url) #if wb_url.mod == 'vi_': # return self._get_video_info(wbrequest) - host_prefix = self.get_host_prefix() - rel_prefix = self.get_rel_prefix() + host_prefix = self.get_host_prefix(environ) + rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix - resp = self.handle_custom_response(wb_url, full_prefix, host_prefix, kwargs) + resp = self.handle_custom_response(environ, wb_url, + full_prefix, host_prefix, kwargs) if resp is not None: - return resp + return WbResponse.text_response(resp, content_type='text/html') urlrewriter = UrlRewriter(wb_url, prefix=full_prefix, full_prefix=full_prefix, rel_prefix=rel_prefix) - self.unrewrite_referrer() + self.unrewrite_referrer(environ) url = wb_url.url urlkey = canonicalize(url) - inputreq = RewriteInputRequest(request.environ, urlkey, url, + inputreq = RewriteInputRequest(environ, urlkey, url, self.content_rewriter) mod_url = None @@ -86,7 +119,7 @@ class RewriterApp(object): wb_url.url = mod_url inputreq.url = mod_url - del request.environ['HTTP_RANGE'] + del environ['HTTP_RANGE'] readd_range = True else: async_record_url = mod_url @@ -107,12 +140,12 @@ class RewriterApp(object): else: error = '' - data = dict(url=url, args=kwargs, error=error) - raise HTTPError(r.status_code, exception=data) + details = dict(args=kwargs, error=error) + raise UpstreamException(r.status_code, url=url, details=details) if async_record_url: #print('ASYNC REC', async_record_url) - request.environ.pop('HTTP_RANGE', '') + environ.pop('HTTP_RANGE', '') gevent.spawn(self._do_async_req, inputreq, async_record_url, @@ -139,14 +172,16 @@ class RewriterApp(object): except (ValueError, TypeError): pass - if self.is_ajax(): + if self.is_ajax(environ): head_insert_func = None else: + top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view. create_insert_func(wb_url, full_prefix, host_prefix, - request.environ, + top_url, + environ, self.framed_replay)) result = self.content_rewriter.rewrite_content(urlrewriter, @@ -157,17 +192,15 @@ class RewriterApp(object): cdx) status_headers, gen, is_rw = result + return WbResponse(status_headers, gen) - response.status = int(status_headers.get_statuscode()) - - for n, v in status_headers.headers: - response.add_header(n, v) - - return gen + def get_top_url(self, full_prefix, wb_url, cdx, kwargs): + top_url = full_prefix + top_url += wb_url.to_str(mod='') + return top_url def _do_async_req(self, *args): count = 0 - #print('ASYNC') try: r = self._do_req(*args) while True: @@ -180,13 +213,17 @@ class RewriterApp(object): traceback.print_exc() finally: - #print('CLOSING') - #print('READ ASYNC', count) try: r.raw.close() except: pass + def handle_error(self, environ, ue): + error_html = self.error_view.render_to_string(environ, + err_msg=ue.url, + err_details=ue.msg) + + return WbResponse.text_response(error_html, content_type='text/html') def _do_req(self, inputreq, url, wb_url, kwargs, skip): req_data = inputreq.reconstruct_request(url) @@ -213,36 +250,92 @@ class RewriterApp(object): def do_query(self, wb_url, kwargs): upstream_url = self.get_upstream_url(wb_url.url, wb_url, 'now', kwargs) upstream_url = upstream_url.replace('/resource/postreq', '/index') - r = requests.get(upstream_url + '&output=json') - print(r.text) + + upstream_url += '&output=json' + upstream_url += '&from=' + wb_url.timestamp + '&to=' + wb_url.end_timestamp + + r = requests.get(upstream_url) + return r.text - def get_host_prefix(self): - return request.urlparts.scheme + '://' + request.urlparts.netloc + def handle_query(self, environ, wb_url, kwargs): + res = self.do_query(wb_url, kwargs) - def get_rel_prefix(self): - return request.script_name + def format_cdx(text): + cdx_lines = text.rstrip().split('\n') + for cdx in cdx_lines: + if not cdx: + continue - def get_full_prefix(self): - return self.get_host_prefix() + self.get_rel_prefix() + cdx = json.loads(cdx) + self.process_query_cdx(cdx, wb_url, kwargs) + yield cdx - def unrewrite_referrer(self): - referrer = request.environ.get('HTTP_REFERER') + prefix = self.get_full_prefix(environ) + + params = dict(url=wb_url.url, + prefix=prefix, + cdx_lines=list(format_cdx(res))) + + extra_params = self.get_query_params(wb_url, kwargs) + if extra_params: + params.update(extra_params) + + return self.query_view.render_to_string(environ, **params) + + def process_query_cdx(self, cdx, wb_url, kwargs): + return + + def get_query_params(self, wb_url, kwargs): + return None + + def get_host_prefix(self, environ): + #return request.urlparts.scheme + '://' + request.urlparts.netloc + url = environ['wsgi.url_scheme'] + '://' + if environ.get('HTTP_HOST'): + url += environ['HTTP_HOST'] + else: + url += environ['SERVER_NAME'] + if environ['wsgi.url_scheme'] == 'https': + if environ['SERVER_PORT'] != '443': + url += ':' + environ['SERVER_PORT'] + else: + if environ['SERVER_PORT'] != '80': + url += ':' + environ['SERVER_PORT'] + + return url + + def get_rel_prefix(self, environ): + #return request.script_name + return environ.get('SCRIPT_NAME') + '/' + + def get_full_prefix(self, environ): + return self.get_host_prefix(environ) + self.get_rel_prefix(environ) + + def get_wburl(self, environ): + wb_url = environ.get('PATH_INFO', '/')[1:] + if environ.get('QUERY_STRING'): + wb_url += '?' + environ.get('QUERY_STRING') + + return wb_url + + def unrewrite_referrer(self, environ): + referrer = environ.get('HTTP_REFERER') if not referrer: return False - full_prefix = self.get_full_prefix() + full_prefix = self.get_full_prefix(environ) if referrer.startswith(full_prefix): referrer = referrer[len(full_prefix):] - request.environ['HTTP_REFERER'] = WbUrl(referrer).url + environ['HTTP_REFERER'] = WbUrl(referrer).url return True return False - def is_ajax(self): - value = request.environ.get('HTTP_X_REQUESTED_WITH') - value = value or request.environ.get('HTTP_X_PYWB_REQUESTED_WITH') + def is_ajax(self, environ): + value = environ.get('HTTP_X_REQUESTED_WITH') + value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': return True @@ -258,16 +351,17 @@ class RewriterApp(object): def get_top_frame_params(self, wb_url, kwargs): return None - def handle_custom_response(self, wb_url, full_prefix, host_prefix, kwargs): + def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs): if wb_url.is_query(): - return self.do_query(wb_url, kwargs) + return self.handle_query(environ, wb_url, kwargs) + #return self.do_query(wb_url, kwargs) if self.framed_replay and wb_url.mod == self.frame_mod: extra_params = self.get_top_frame_params(wb_url, kwargs) return self.frame_insert_view.get_top_frame(wb_url, full_prefix, host_prefix, - request.environ, + environ, self.frame_mod, self.replay_mod, coll='', diff --git a/urlrewrite/templateview.py b/urlrewrite/templateview.py index 19039567..804727a2 100644 --- a/urlrewrite/templateview.py +++ b/urlrewrite/templateview.py @@ -87,7 +87,7 @@ class JinjaEnv(object): # ============================================================================ class BaseInsertView(object): - def __init__(self, jenv, insert_file, banner_file): + def __init__(self, jenv, insert_file, banner_file=''): self.jenv = jenv self.insert_file = insert_file self.banner_file = banner_file @@ -106,6 +106,7 @@ class HeadInsertView(BaseInsertView): def create_insert_func(self, wb_url, wb_prefix, host_prefix, + top_url, env, is_framed, coll='', @@ -113,9 +114,6 @@ class HeadInsertView(BaseInsertView): url = wb_url.get_url() - top_url = wb_prefix - top_url += wb_url.to_str(mod='') - include_wombat = not wb_url.is_banner_only wbrequest = {'host_prefix': host_prefix,