From 95028ab692a19d6c2a293625011f38d945fde6aa Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 4 Aug 2014 01:18:46 -0700 Subject: [PATCH] refactoring for better extensibility: remove BaseContentView, move top-frame functionality to SearchPageWbUrlHandler remove RewriteLiveView, fold functionality into the handler move default mod setting into RewriteContent --- pywb/rewrite/rewrite_content.py | 8 ++- pywb/rewrite/rewrite_live.py | 9 ++- pywb/webapp/handlers.py | 84 +++++++++++++++--------- pywb/webapp/live_rewrite_handler.py | 40 ++++++++++-- pywb/webapp/pywb_init.py | 17 ++++- pywb/webapp/replay_views.py | 99 +++-------------------------- pywb/webapp/views.py | 8 +++ tests/test_integration.py | 4 +- 8 files changed, 131 insertions(+), 138 deletions(-) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 93ec396b..3a635d4e 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -18,11 +18,15 @@ from pywb.utils.bufferedreaders import ChunkedDataReader #================================================================= class RewriteContent: - def __init__(self, ds_rules_file=None, defmod=''): + def __init__(self, ds_rules_file=None, is_framed_replay=False): self.ruleset = RuleSet(RewriteRules, 'rewrite', default_rule_config={}, ds_rules_file=ds_rules_file) - self.defmod = defmod + + if is_framed_replay: + self.defmod = 'mp_' + else: + self.defmod = '' def sanitize_content(self, status_headers, stream): # remove transfer encoding chunked and wrap in a dechunking stream diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 97024600..6c7f33fe 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -21,8 +21,8 @@ from rewrite_content import RewriteContent #================================================================= class LiveRewriter(object): - def __init__(self, defmod='', default_proxy=None): - self.rewriter = RewriteContent(defmod=defmod) + def __init__(self, is_framed_replay=False, default_proxy=None): + self.rewriter = RewriteContent(is_framed_replay=is_framed_replay) self.default_proxy = default_proxy if self.default_proxy: logging.debug('Live Rewrite via proxy ' + self.default_proxy) @@ -73,7 +73,7 @@ class LiveRewriter(object): def fetch_http(self, url, env=None, - req_headers={}, + req_headers=None, follow_redirects=False, proxies=None): @@ -84,6 +84,9 @@ class LiveRewriter(object): proxies = {'http': self.default_proxy, 'https': self.default_proxy} + if not req_headers: + req_headers = {} + if env is not None: method = env['REQUEST_METHOD'].upper() input_ = env['wsgi.input'] diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index ce30793d..9b5fa718 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -2,6 +2,8 @@ import pkgutil import mimetypes import time +from datetime import datetime + from pywb.utils.wbexception import NotFoundException from pywb.utils.loaders import BlockLoader @@ -11,8 +13,9 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader -from views import J2TemplateView, add_env_globals +from views import J2TemplateView from replay_views import ReplayView +from pywb.utils.timeutils import datetime_to_timestamp #================================================================= @@ -26,6 +29,15 @@ class SearchPageWbUrlHandler(WbUrlHandler): create_template(config.get('search_html'), 'Search Page')) + self.is_frame_mode = config.get('framed_replay', False) + + if self.is_frame_mode: + html = config.get('frame_insert_html', 'ui/frame_insert.html') + self.frame_insert_view = (J2TemplateView. + create_template(html, 'Frame Insert')) + else: + self.frame_insert_view = None + def render_search_page(self, wbrequest, **kwargs): if self.search_view: return self.search_view.render_response(wbrequest=wbrequest, @@ -34,6 +46,38 @@ class SearchPageWbUrlHandler(WbUrlHandler): else: return WbResponse.text_response('No Lookup Url Specified') + def __call__(self, wbrequest): + # root search page + if wbrequest.wb_url_str == '/': + return self.render_search_page(wbrequest) + + # render top level frame if in frame mode + # (not supported in proxy mode) + if (self.is_frame_mode and wbrequest.wb_url and + not wbrequest.wb_url.is_query() and + not wbrequest.wb_url.mod and + not wbrequest.options['is_proxy']): + + params = self.get_top_frame_params(wbrequest) + + return self.frame_insert_view.render_response(**params) + + return self.handle_request(wbrequest) + + def get_top_frame_params(self, wbrequest): + if wbrequest.wb_url.timestamp: + timestamp = wbrequest.wb_url.timestamp + else: + timestamp = datetime_to_timestamp(datetime.utcnow()) + + embed_url = wbrequest.wb_url.to_str(mod='mp_') + + return dict(embed_url=embed_url, + wbrequest=wbrequest, + timestamp=timestamp, + url=wbrequest.wb_url.url, + content_type='text/html') + #================================================================= # Standard WB Handler @@ -52,10 +96,6 @@ class WBHandler(SearchPageWbUrlHandler): resolving_loader = ResolvingLoader(paths=paths, record_loader=record_loader) - template_globals = config.get('template_globals') - if template_globals: - add_env_globals(template_globals) - self.replay = ReplayView(resolving_loader, config) self.fallback_handler = None @@ -65,13 +105,9 @@ class WBHandler(SearchPageWbUrlHandler): if self.fallback_name: self.fallback_handler = handler_dict.get(self.fallback_name) - def __call__(self, wbrequest): - if wbrequest.wb_url_str == '/': - return self.render_search_page(wbrequest) - + def handle_request(self, wbrequest): try: - with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: - response = self.index_reader.load_for_request(wbrequest) + response = self.handle_query(wbrequest) except NotFoundException as nfe: return self.handle_not_found(wbrequest, nfe) @@ -81,11 +117,13 @@ class WBHandler(SearchPageWbUrlHandler): cdx_lines, cdx_callback = response return self.handle_replay(wbrequest, cdx_lines, cdx_callback) + def handle_query(self, wbrequest): + return self.index_reader.load_for_request(wbrequest) + def handle_replay(self, wbrequest, cdx_lines, cdx_callback): - with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, - cdx_lines, - cdx_callback) + return self.replay.render_content(wbrequest, + cdx_lines, + cdx_callback) def handle_not_found(self, wbrequest, nfe): if (not self.fallback_handler or @@ -154,19 +192,3 @@ class DebugEchoEnvHandler(BaseHandler): # pragma: no cover class DebugEchoHandler(BaseHandler): # pragma: no cover def __call__(self, wbrequest): return WbResponse.text_response(str(wbrequest)) - - -#================================================================= -class PerfTimer: - def __init__(self, perfdict, name): - self.perfdict = perfdict - self.name = name - - def __enter__(self): - self.start = time.clock() - return self - - def __exit__(self, *args): - self.end = time.clock() - if self.perfdict is not None: - self.perfdict[self.name] = str(self.end - self.start) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index e1e2d53e..a343fbee 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -2,9 +2,11 @@ from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route -from handlers import StaticHandler, SearchPageWbUrlHandler +from pywb.rewrite.rewrite_live import LiveRewriter +from pywb.rewrite.wburl import WbUrl -from replay_views import RewriteLiveView +from handlers import StaticHandler, SearchPageWbUrlHandler +from views import HeadInsertView from pywb.utils.wbexception import WbException @@ -19,20 +21,44 @@ class LiveResourceException(WbException): class RewriteHandler(SearchPageWbUrlHandler): def __init__(self, config): super(RewriteHandler, self).__init__(config) - self.rewrite_view = RewriteLiveView(config) - def __call__(self, wbrequest): - if wbrequest.wb_url_str == '/': - return self.render_search_page(wbrequest) + default_proxy = config.get('proxyhostport') + self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode, + default_proxy=default_proxy) + self.head_insert_view = HeadInsertView.init_from_config(config) + + def handle_request(self, wbrequest): try: - return self.rewrite_view(wbrequest) + return self.render_content(wbrequest) except Exception as exc: url = wbrequest.wb_url.url msg = 'Could not load the url from the live web: ' + url raise LiveResourceException(msg=msg, url=url) + def _live_request_headers(self, wbrequest): + return {} + + def render_content(self, wbrequest): + head_insert_func = self.head_insert_view.create_insert_func(wbrequest) + req_headers = self._live_request_headers(wbrequest) + + ref_wburl_str = wbrequest.extract_referrer_wburl_str() + if ref_wburl_str: + wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url + + wb_url = wbrequest.wb_url + result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, + head_insert_func=head_insert_func, + req_headers=req_headers, + env=wbrequest.env) + + return self._make_response(wbrequest, *result) + + def _make_response(self, wbrequest, status_headers, gen, is_rewritten): + return WbResponse(status_headers, gen) + def __str__(self): return 'Live Web Rewrite Handler' diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 2fd02377..02efbf89 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -6,7 +6,7 @@ from pywb.framework.wbrequestresponse import WbRequest from pywb.framework.memento import MementoRequest from pywb.framework.basehandlers import BaseHandler -from views import J2TemplateView +from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView from live_rewrite_handler import RewriteHandler @@ -71,7 +71,10 @@ def create_wb_handler(query_handler, config): #================================================================= def create_live_handler(config): - live_handler = RewriteHandler(config) + wb_handler_class = config.get('wb_handler_class', RewriteHandler) + + live_handler = wb_handler_class(config) + return live_handler @@ -92,9 +95,12 @@ def init_collection(route_config): create_template(route_config.get('query_html'), 'Captures Page')) + server_cls = route_config.get('server_cls') + query_handler = QueryHandler.init_from_config(route_config, ds_rules_file, - html_view) + html_view, + server_cls) return query_handler @@ -162,6 +168,11 @@ def create_wb_router(passed_config={}): # store live and replay handlers handler_dict = {} + # setup template globals + template_globals = config.get('template_globals') + if template_globals: + add_env_globals(template_globals) + for name, value in collections.iteritems(): if isinstance(value, BaseHandler): handler_dict[name] = value diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 2542aee2..5002a18d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -1,19 +1,14 @@ import re -import datetime from io import BytesIO from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.loaders import LimitReader -from pywb.utils.timeutils import datetime_to_timestamp from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse from pywb.rewrite.rewrite_content import RewriteContent -from pywb.rewrite.rewrite_live import LiveRewriter -from pywb.rewrite.wburl import WbUrl - from pywb.warc.recordloader import ArchiveLoadFailed from views import J2TemplateView, add_env_globals @@ -32,92 +27,16 @@ class CaptureException(WbException): #================================================================= -class BaseContentView(object): - def __init__(self, config): - self.is_frame_mode = config.get('framed_replay', False) - - if self.is_frame_mode: - self._mp_mod = 'mp_' - else: - self._mp_mod = '' - - view = config.get('head_insert_view') - if not view: - head_insert = config.get('head_insert_html', - 'ui/head_insert.html') - view = HeadInsertView.create_template(head_insert, 'Head Insert') - - self.head_insert_view = view - - if not self.is_frame_mode: - self.frame_insert_view = None - return - - view = config.get('frame_insert_view') - if not view: - frame_insert = config.get('frame_insert_html', - 'ui/frame_insert.html') - - view = J2TemplateView.create_template(frame_insert, 'Frame Insert') - - self.frame_insert_view = view - - def __call__(self, wbrequest, *args): - # render top level frame if in frame mode - # (not supported in proxy mode) - if (self.is_frame_mode and wbrequest.wb_url and - not wbrequest.wb_url.mod and - not wbrequest.options['is_proxy']): - - embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod) - timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) - url = wbrequest.wb_url.url - ctype = 'text/html' - - return self.frame_insert_view.render_response(embed_url=embed_url, - wbrequest=wbrequest, - timestamp=timestamp, - url=url, - content_type=ctype) - - return self.render_content(wbrequest, *args) - - -#================================================================= -class RewriteLiveView(BaseContentView): - def __init__(self, config): - super(RewriteLiveView, self).__init__(config) - - default_proxy = config.get('proxyhostport') - self.rewriter = LiveRewriter(defmod=self._mp_mod, - default_proxy=default_proxy) - - def render_content(self, wbrequest, *args): - head_insert_func = self.head_insert_view.create_insert_func(wbrequest) - - ref_wburl_str = wbrequest.extract_referrer_wburl_str() - if ref_wburl_str: - wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - - wb_url = wbrequest.wb_url - result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, - head_insert_func=head_insert_func, - env=wbrequest.env) - - status_headers, gen, is_rewritten = result - - return WbResponse(status_headers, gen) - - -#================================================================= -class ReplayView(BaseContentView): +class ReplayView(object): STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') def __init__(self, content_loader, config): - super(ReplayView, self).__init__(config) - self.content_loader = content_loader - self.content_rewriter = RewriteContent(defmod=self._mp_mod) + + framed = config.get('framed_replay') + self.content_rewriter = RewriteContent(is_framed_replay=framed) + + self.head_insert_view = HeadInsertView.init_from_config(config) self.buffer_response = config.get('buffer_response', True) @@ -131,12 +50,12 @@ class ReplayView(BaseContentView): self._reporter = config.get('reporter') - def render_content(self, wbrequest, *args): + def render_content(self, wbrequest, cdx_lines, cdx_loader): last_e = None first = True - cdx_lines = args[0] - cdx_loader = args[1] + #cdx_lines = args[0] + #cdx_loader = args[1] # List of already failed w/arcs failed_files = [] diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 0fc5589d..e7034050 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -141,6 +141,14 @@ class HeadInsertView(J2TemplateView): return J2TemplateView.create_template(filename, desc, HeadInsertView) + @staticmethod + def init_from_config(config): + view = config.get('head_insert_view') + if not view: + html = config.get('head_insert_html', 'ui/head_insert.html') + view = HeadInsertView.create_template(html, 'Head Insert') + return view + #================================================================= # query views diff --git a/tests/test_integration.py b/tests/test_integration.py index 456d50f8..6cc32482 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -310,7 +310,7 @@ class TestWb: def test_excluded_content(self): - resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403) assert resp.status_int == 403 assert 'Excluded' in resp.body @@ -414,7 +414,7 @@ class TestWb: def test_error(self): - resp = self.testapp.get('/pywb/?abc', status = 400) + resp = self.testapp.get('/pywb/mp_/?abc', status = 400) assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body