From 7694bf06788cfbb2fd376aa734d2fb5dad89bfac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 1 Jul 2014 16:22:38 -0700 Subject: [PATCH 01/13] update README.rst for master 0.4.7 --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2fc4a3f8..9402d69d 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,11 @@ PyWb 0.4.7 ========== -.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop +.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master :target: https://travis-ci.org/ikreymer/pywb -.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop - :target: https://coveralls.io/r/ikreymer/pywb?branch=develop +.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master + :target: https://coveralls.io/r/ikreymer/pywb?branch=master pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. From 160182ec4807dfdeff2889306daa260a21e2d2c7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 29 Jul 2014 12:20:22 -0700 Subject: [PATCH 02/13] rewrite: add 'bn_' banner only rewrite cleanup rewrite_content/fetch_request api to take a full wb_url add content-length to responses whenever possible (WbResponse) and static files bump version to 0.5.2 --- README.rst | 2 +- pywb/framework/test/test_wbrequestresponse.py | 4 +- pywb/framework/wbrequestresponse.py | 24 +++++--- pywb/rewrite/rewrite_content.py | 46 ++++++++++++-- pywb/rewrite/rewrite_live.py | 60 +++++-------------- pywb/rewrite/wburl.py | 7 ++- pywb/ui/head_insert.html | 5 +- pywb/webapp/handlers.py | 12 +++- pywb/webapp/replay_views.py | 11 ++-- pywb/webapp/views.py | 6 +- setup.py | 2 +- tests/test_integration.py | 14 +++++ 12 files changed, 118 insertions(+), 75 deletions(-) diff --git a/README.rst b/README.rst index 233f347b..90c84b6e 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.5.1 +PyWb 0.5.2 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index e066d4d1..5bbb65b8 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -40,13 +40,13 @@ # WbResponse Tests # ================= >>> WbResponse.text_response('Test') -{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])} +{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain'), ('Content-Length', '4')])} >>> WbResponse.text_stream(['Test', 'Another'], '404') {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} >>> WbResponse.redir_response('http://example.com/otherfile') -{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} +{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])} """ diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 0f1a9f32..da456474 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -125,7 +125,7 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE') + mime = self.env.get('CONTENT_TYPE').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] @@ -152,23 +152,31 @@ class WbResponse(object): pass @staticmethod - def text_stream(stream, status='200 OK', content_type='text/plain'): - status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + def text_stream(stream, status='200 OK', content_type='text/plain', + headers=None): + def_headers = [('Content-Type', content_type)] + if headers: + def_headers += headers + + status_headers = StatusAndHeaders(status, def_headers) return WbResponse(status_headers, value=stream) @staticmethod def text_response(text, status='200 OK', content_type='text/plain'): status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + [('Content-Type', content_type), + ('Content-Length', str(len(text)))]) return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status='302 Redirect'): - return WbResponse(StatusAndHeaders(status, - [('Location', location)])) + def redir_response(location, status='302 Redirect', headers=None): + redir_headers = [('Location', location), ('Content-Length', '0')] + if headers: + redir_headers += headers + + return WbResponse(StatusAndHeaders(status, redir_headers)) def __call__(self, env, start_response): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index ec93593a..93ec396b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -1,6 +1,8 @@ #import chardet import pkgutil import yaml +import re + from chardet.universaldetector import UniversalDetector from io import BytesIO @@ -52,11 +54,12 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, urlrewriter, headers, stream, + def rewrite_content(self, wb_url, urlrewriter, headers, stream, head_insert_func=None, urlkey='', - sanitize_only=False, cdx=None, mod=None): + cdx=None): - if sanitize_only: + if (wb_url.is_identity or + (not head_insert_func and wb_url.is_banner_only)): status_headers, stream = self.sanitize_content(headers, stream) return (status_headers, self.stream_to_gen(stream), False) @@ -78,6 +81,8 @@ class RewriteContent: # see known js/css modifier specified, the context should run # default text_type + mod = wb_url.mod + if mod == 'js_': text_type = 'js' elif mod == 'cs_': @@ -118,6 +123,10 @@ class RewriteContent: if head_insert_func: head_insert_str = head_insert_func(rule, cdx) + if wb_url.is_banner_only: + gen = self._head_insert_only_gen(head_insert_str, stream) + return (status_headers, gen, False) + rewriter = rewriter_class(urlrewriter, js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], @@ -125,7 +134,10 @@ class RewriteContent: defmod=self.defmod) else: - # apply one of (js, css, xml) rewriters + if wb_url.is_banner_only: + return (status_headers, self.stream_to_gen(stream), False) + + # apply one of (js, css, xml) rewriters rewriter = rewriter_class(urlrewriter) # Create rewriting generator @@ -134,6 +146,32 @@ class RewriteContent: return (status_headers, gen, True) + HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I) + + def _head_insert_only_gen(self, insert_str, stream): + max_len = 1024 + buff = '' + while max_len > 0: + curr = stream.read(max_len) + if not curr: + break + + max_len -= len(buff) + buff += curr + + matcher = self.HEAD_REGEX.search(buff) + + if matcher: + yield buff[:matcher.end()] + insert_str + yield buff[matcher.end():] + else: + yield insert_str + yield buff + + for buff in self.stream_to_gen(stream): + yield buff + + # Create rewrite stream, may even be chunked by front-end def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff=None): diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index b81b0144..97024600 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -14,8 +14,9 @@ from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize -from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.rewrite_content import RewriteContent +from url_rewriter import UrlRewriter +from wburl import WbUrl +from rewrite_content import RewriteContent #================================================================= @@ -114,15 +115,20 @@ class LiveRewriter(object): return (status_headers, stream) - def fetch_request(self, url, urlrewriter, + def fetch_request(self, wb_url, urlrewriter, head_insert_func=None, urlkey=None, env=None, req_headers={}, timestamp=None, follow_redirects=False, - proxies=None, - mod=None): + proxies=None): + + if isinstance(wb_url, str): + url = wb_url + wb_url = WbUrl(url) + else: + url = wb_url.url ts_err = url.split('///') @@ -155,13 +161,13 @@ class LiveRewriter(object): } result = (self.rewriter. - rewrite_content(urlrewriter, + rewrite_content(wb_url, + urlrewriter, status_headers, stream, head_insert_func=head_insert_func, urlkey=urlkey, - cdx=cdx, - mod=mod)) + cdx=cdx)) return result @@ -174,41 +180,3 @@ class LiveRewriter(object): buff = ''.join(gen) return (status_headers, buff) - - -#================================================================= -def main(): # pragma: no cover - import sys - - if len(sys.argv) < 2: - msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]' - print msg.format(sys.argv[0]) - return 1 - else: - url = sys.argv[1] - - if len(sys.argv) >= 3: - wburl_str = sys.argv[2] - if wburl_str.startswith('/'): - wburl_str = wburl_str[1:] - - prefix, wburl_str = wburl_str.split('/', 1) - prefix = '/' + prefix + '/' - else: - wburl_str = (datetime_to_timestamp(datetime.datetime.now()) + - '/http://example.com/path/sample.html') - prefix = '/pywb_rewrite/' - - urlrewriter = UrlRewriter(wburl_str, prefix) - - liverewriter = LiveRewriter() - - status_headers, buff = liverewriter.get_rewritten(url, urlrewriter) - - sys.stdout.write(buff) - return 0 - - -#================================================================= -if __name__ == "__main__": - exit(main()) diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 3cd9ad72..f826108f 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -196,8 +196,11 @@ class WbUrl(BaseWbUrl): @property def is_embed(self): return (self.mod and - self.mod != 'id_' and - self.mod != 'mp_') + self.mod not in ('id_', 'mp_', 'bn_')) + + @property + def is_banner_only(self): + return (self.mod == 'bn_') @property def is_identity(self): diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index be810823..b1ff4a26 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,5 +1,5 @@ -{% if rule.js_rewrite_location %} +{% if rule.js_rewrite_location and include_wombat %} diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 6228de3e..ce30793d 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -115,6 +115,14 @@ class StaticHandler(BaseHandler): try: data = self.block_loader.load(full_path) + try: + data.seek(0, 2) + size = data.tell() + data.seek(0) + headers = [('Content-Length', str(size))] + except IOError: + headers = None + if 'wsgi.file_wrapper' in wbrequest.env: reader = wbrequest.env['wsgi.file_wrapper'](data) else: @@ -122,7 +130,9 @@ class StaticHandler(BaseHandler): content_type, _ = mimetypes.guess_type(full_path) - return WbResponse.text_stream(data, content_type=content_type) + return WbResponse.text_stream(data, + content_type=content_type, + headers=headers) except IOError: raise NotFoundException('Static File Not Found: ' + diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index c4e0f4f3..2542aee2 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -99,8 +99,8 @@ class RewriteLiveView(BaseContentView): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - url = wbrequest.wb_url.url - result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, + wb_url = wbrequest.wb_url + result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, head_insert_func=head_insert_func, env=wbrequest.env) @@ -211,14 +211,13 @@ class ReplayView(BaseContentView): create_insert_func(wbrequest)) result = (self.content_rewriter. - rewrite_content(urlrewriter, + rewrite_content(wbrequest.wb_url, + urlrewriter, headers=status_headers, stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - sanitize_only=wbrequest.wb_url.is_identity, - cdx=cdx, - mod=wbrequest.wb_url.mod)) + cdx=cdx)) (status_headers, response_iter, is_rewritten) = result diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index c49be8c9..0fc5589d 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -121,16 +121,18 @@ def add_env_globals(glb): #================================================================= class HeadInsertView(J2TemplateView): - def create_insert_func(self, wbrequest, include_ts=True): + def create_insert_func(self, wbrequest, + include_ts=True): canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='') - include_ts = include_ts + include_wombat = not wbrequest.wb_url.is_banner_only def make_head_insert(rule, cdx): return (self.render_to_string(wbrequest=wbrequest, cdx=cdx, canon_url=canon_url, include_ts=include_ts, + include_wombat=include_wombat, rule=rule)) return make_head_insert diff --git a/setup.py b/setup.py index 3e89abed..a6e9c885 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.5.1', + version='0.5.2', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', diff --git a/tests/test_integration.py b/tests/test_integration.py index 94ce45cf..456d50f8 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -98,6 +98,7 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'wb.js' in resp.body + assert 'WB_wombat_init' in resp.body assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body def test_replay_non_frame_content(self): @@ -141,6 +142,19 @@ class TestWb: assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239') + def test_replay_banner_only(self): + resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved') + + # wb.js header insertion + assert 'wb.js' in resp.body + + # no wombat present + assert 'WB_wombat_init' not in resp.body + + # url not rewritten + #assert '"http://www.iana.org/domains/example"' in resp.body + assert '"/_css/2013.1/screen.css"' in resp.body + def test_replay_identity_1(self): resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') From 8d5415332655f5e51ed38546c180b0c4e8d65bc1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 4 Aug 2014 01:18:46 -0700 Subject: [PATCH 03/13] refactoring for better extensibility: remove BaseContentView, move top-frame functionality to SearchPageWbUrlHandler remove RewriteLiveView, fold functionality into the handler move default mod setting into RewriteContent --- pywb/rewrite/rewrite_content.py | 8 ++- pywb/rewrite/rewrite_live.py | 9 ++- pywb/webapp/handlers.py | 84 +++++++++++++++--------- pywb/webapp/live_rewrite_handler.py | 40 ++++++++++-- pywb/webapp/pywb_init.py | 17 ++++- pywb/webapp/replay_views.py | 99 +++-------------------------- pywb/webapp/views.py | 8 +++ tests/test_integration.py | 4 +- 8 files changed, 131 insertions(+), 138 deletions(-) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 93ec396b..3a635d4e 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -18,11 +18,15 @@ from pywb.utils.bufferedreaders import ChunkedDataReader #================================================================= class RewriteContent: - def __init__(self, ds_rules_file=None, defmod=''): + def __init__(self, ds_rules_file=None, is_framed_replay=False): self.ruleset = RuleSet(RewriteRules, 'rewrite', default_rule_config={}, ds_rules_file=ds_rules_file) - self.defmod = defmod + + if is_framed_replay: + self.defmod = 'mp_' + else: + self.defmod = '' def sanitize_content(self, status_headers, stream): # remove transfer encoding chunked and wrap in a dechunking stream diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 97024600..6c7f33fe 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -21,8 +21,8 @@ from rewrite_content import RewriteContent #================================================================= class LiveRewriter(object): - def __init__(self, defmod='', default_proxy=None): - self.rewriter = RewriteContent(defmod=defmod) + def __init__(self, is_framed_replay=False, default_proxy=None): + self.rewriter = RewriteContent(is_framed_replay=is_framed_replay) self.default_proxy = default_proxy if self.default_proxy: logging.debug('Live Rewrite via proxy ' + self.default_proxy) @@ -73,7 +73,7 @@ class LiveRewriter(object): def fetch_http(self, url, env=None, - req_headers={}, + req_headers=None, follow_redirects=False, proxies=None): @@ -84,6 +84,9 @@ class LiveRewriter(object): proxies = {'http': self.default_proxy, 'https': self.default_proxy} + if not req_headers: + req_headers = {} + if env is not None: method = env['REQUEST_METHOD'].upper() input_ = env['wsgi.input'] diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index ce30793d..9b5fa718 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -2,6 +2,8 @@ import pkgutil import mimetypes import time +from datetime import datetime + from pywb.utils.wbexception import NotFoundException from pywb.utils.loaders import BlockLoader @@ -11,8 +13,9 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader -from views import J2TemplateView, add_env_globals +from views import J2TemplateView from replay_views import ReplayView +from pywb.utils.timeutils import datetime_to_timestamp #================================================================= @@ -26,6 +29,15 @@ class SearchPageWbUrlHandler(WbUrlHandler): create_template(config.get('search_html'), 'Search Page')) + self.is_frame_mode = config.get('framed_replay', False) + + if self.is_frame_mode: + html = config.get('frame_insert_html', 'ui/frame_insert.html') + self.frame_insert_view = (J2TemplateView. + create_template(html, 'Frame Insert')) + else: + self.frame_insert_view = None + def render_search_page(self, wbrequest, **kwargs): if self.search_view: return self.search_view.render_response(wbrequest=wbrequest, @@ -34,6 +46,38 @@ class SearchPageWbUrlHandler(WbUrlHandler): else: return WbResponse.text_response('No Lookup Url Specified') + def __call__(self, wbrequest): + # root search page + if wbrequest.wb_url_str == '/': + return self.render_search_page(wbrequest) + + # render top level frame if in frame mode + # (not supported in proxy mode) + if (self.is_frame_mode and wbrequest.wb_url and + not wbrequest.wb_url.is_query() and + not wbrequest.wb_url.mod and + not wbrequest.options['is_proxy']): + + params = self.get_top_frame_params(wbrequest) + + return self.frame_insert_view.render_response(**params) + + return self.handle_request(wbrequest) + + def get_top_frame_params(self, wbrequest): + if wbrequest.wb_url.timestamp: + timestamp = wbrequest.wb_url.timestamp + else: + timestamp = datetime_to_timestamp(datetime.utcnow()) + + embed_url = wbrequest.wb_url.to_str(mod='mp_') + + return dict(embed_url=embed_url, + wbrequest=wbrequest, + timestamp=timestamp, + url=wbrequest.wb_url.url, + content_type='text/html') + #================================================================= # Standard WB Handler @@ -52,10 +96,6 @@ class WBHandler(SearchPageWbUrlHandler): resolving_loader = ResolvingLoader(paths=paths, record_loader=record_loader) - template_globals = config.get('template_globals') - if template_globals: - add_env_globals(template_globals) - self.replay = ReplayView(resolving_loader, config) self.fallback_handler = None @@ -65,13 +105,9 @@ class WBHandler(SearchPageWbUrlHandler): if self.fallback_name: self.fallback_handler = handler_dict.get(self.fallback_name) - def __call__(self, wbrequest): - if wbrequest.wb_url_str == '/': - return self.render_search_page(wbrequest) - + def handle_request(self, wbrequest): try: - with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: - response = self.index_reader.load_for_request(wbrequest) + response = self.handle_query(wbrequest) except NotFoundException as nfe: return self.handle_not_found(wbrequest, nfe) @@ -81,11 +117,13 @@ class WBHandler(SearchPageWbUrlHandler): cdx_lines, cdx_callback = response return self.handle_replay(wbrequest, cdx_lines, cdx_callback) + def handle_query(self, wbrequest): + return self.index_reader.load_for_request(wbrequest) + def handle_replay(self, wbrequest, cdx_lines, cdx_callback): - with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, - cdx_lines, - cdx_callback) + return self.replay.render_content(wbrequest, + cdx_lines, + cdx_callback) def handle_not_found(self, wbrequest, nfe): if (not self.fallback_handler or @@ -154,19 +192,3 @@ class DebugEchoEnvHandler(BaseHandler): # pragma: no cover class DebugEchoHandler(BaseHandler): # pragma: no cover def __call__(self, wbrequest): return WbResponse.text_response(str(wbrequest)) - - -#================================================================= -class PerfTimer: - def __init__(self, perfdict, name): - self.perfdict = perfdict - self.name = name - - def __enter__(self): - self.start = time.clock() - return self - - def __exit__(self, *args): - self.end = time.clock() - if self.perfdict is not None: - self.perfdict[self.name] = str(self.end - self.start) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index e1e2d53e..a343fbee 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -2,9 +2,11 @@ from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route -from handlers import StaticHandler, SearchPageWbUrlHandler +from pywb.rewrite.rewrite_live import LiveRewriter +from pywb.rewrite.wburl import WbUrl -from replay_views import RewriteLiveView +from handlers import StaticHandler, SearchPageWbUrlHandler +from views import HeadInsertView from pywb.utils.wbexception import WbException @@ -19,20 +21,44 @@ class LiveResourceException(WbException): class RewriteHandler(SearchPageWbUrlHandler): def __init__(self, config): super(RewriteHandler, self).__init__(config) - self.rewrite_view = RewriteLiveView(config) - def __call__(self, wbrequest): - if wbrequest.wb_url_str == '/': - return self.render_search_page(wbrequest) + default_proxy = config.get('proxyhostport') + self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode, + default_proxy=default_proxy) + self.head_insert_view = HeadInsertView.init_from_config(config) + + def handle_request(self, wbrequest): try: - return self.rewrite_view(wbrequest) + return self.render_content(wbrequest) except Exception as exc: url = wbrequest.wb_url.url msg = 'Could not load the url from the live web: ' + url raise LiveResourceException(msg=msg, url=url) + def _live_request_headers(self, wbrequest): + return {} + + def render_content(self, wbrequest): + head_insert_func = self.head_insert_view.create_insert_func(wbrequest) + req_headers = self._live_request_headers(wbrequest) + + ref_wburl_str = wbrequest.extract_referrer_wburl_str() + if ref_wburl_str: + wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url + + wb_url = wbrequest.wb_url + result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, + head_insert_func=head_insert_func, + req_headers=req_headers, + env=wbrequest.env) + + return self._make_response(wbrequest, *result) + + def _make_response(self, wbrequest, status_headers, gen, is_rewritten): + return WbResponse(status_headers, gen) + def __str__(self): return 'Live Web Rewrite Handler' diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 2fd02377..02efbf89 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -6,7 +6,7 @@ from pywb.framework.wbrequestresponse import WbRequest from pywb.framework.memento import MementoRequest from pywb.framework.basehandlers import BaseHandler -from views import J2TemplateView +from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView from live_rewrite_handler import RewriteHandler @@ -71,7 +71,10 @@ def create_wb_handler(query_handler, config): #================================================================= def create_live_handler(config): - live_handler = RewriteHandler(config) + wb_handler_class = config.get('wb_handler_class', RewriteHandler) + + live_handler = wb_handler_class(config) + return live_handler @@ -92,9 +95,12 @@ def init_collection(route_config): create_template(route_config.get('query_html'), 'Captures Page')) + server_cls = route_config.get('server_cls') + query_handler = QueryHandler.init_from_config(route_config, ds_rules_file, - html_view) + html_view, + server_cls) return query_handler @@ -162,6 +168,11 @@ def create_wb_router(passed_config={}): # store live and replay handlers handler_dict = {} + # setup template globals + template_globals = config.get('template_globals') + if template_globals: + add_env_globals(template_globals) + for name, value in collections.iteritems(): if isinstance(value, BaseHandler): handler_dict[name] = value diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 2542aee2..5002a18d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -1,19 +1,14 @@ import re -import datetime from io import BytesIO from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.loaders import LimitReader -from pywb.utils.timeutils import datetime_to_timestamp from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse from pywb.rewrite.rewrite_content import RewriteContent -from pywb.rewrite.rewrite_live import LiveRewriter -from pywb.rewrite.wburl import WbUrl - from pywb.warc.recordloader import ArchiveLoadFailed from views import J2TemplateView, add_env_globals @@ -32,92 +27,16 @@ class CaptureException(WbException): #================================================================= -class BaseContentView(object): - def __init__(self, config): - self.is_frame_mode = config.get('framed_replay', False) - - if self.is_frame_mode: - self._mp_mod = 'mp_' - else: - self._mp_mod = '' - - view = config.get('head_insert_view') - if not view: - head_insert = config.get('head_insert_html', - 'ui/head_insert.html') - view = HeadInsertView.create_template(head_insert, 'Head Insert') - - self.head_insert_view = view - - if not self.is_frame_mode: - self.frame_insert_view = None - return - - view = config.get('frame_insert_view') - if not view: - frame_insert = config.get('frame_insert_html', - 'ui/frame_insert.html') - - view = J2TemplateView.create_template(frame_insert, 'Frame Insert') - - self.frame_insert_view = view - - def __call__(self, wbrequest, *args): - # render top level frame if in frame mode - # (not supported in proxy mode) - if (self.is_frame_mode and wbrequest.wb_url and - not wbrequest.wb_url.mod and - not wbrequest.options['is_proxy']): - - embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod) - timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) - url = wbrequest.wb_url.url - ctype = 'text/html' - - return self.frame_insert_view.render_response(embed_url=embed_url, - wbrequest=wbrequest, - timestamp=timestamp, - url=url, - content_type=ctype) - - return self.render_content(wbrequest, *args) - - -#================================================================= -class RewriteLiveView(BaseContentView): - def __init__(self, config): - super(RewriteLiveView, self).__init__(config) - - default_proxy = config.get('proxyhostport') - self.rewriter = LiveRewriter(defmod=self._mp_mod, - default_proxy=default_proxy) - - def render_content(self, wbrequest, *args): - head_insert_func = self.head_insert_view.create_insert_func(wbrequest) - - ref_wburl_str = wbrequest.extract_referrer_wburl_str() - if ref_wburl_str: - wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - - wb_url = wbrequest.wb_url - result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, - head_insert_func=head_insert_func, - env=wbrequest.env) - - status_headers, gen, is_rewritten = result - - return WbResponse(status_headers, gen) - - -#================================================================= -class ReplayView(BaseContentView): +class ReplayView(object): STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$') def __init__(self, content_loader, config): - super(ReplayView, self).__init__(config) - self.content_loader = content_loader - self.content_rewriter = RewriteContent(defmod=self._mp_mod) + + framed = config.get('framed_replay') + self.content_rewriter = RewriteContent(is_framed_replay=framed) + + self.head_insert_view = HeadInsertView.init_from_config(config) self.buffer_response = config.get('buffer_response', True) @@ -131,12 +50,12 @@ class ReplayView(BaseContentView): self._reporter = config.get('reporter') - def render_content(self, wbrequest, *args): + def render_content(self, wbrequest, cdx_lines, cdx_loader): last_e = None first = True - cdx_lines = args[0] - cdx_loader = args[1] + #cdx_lines = args[0] + #cdx_loader = args[1] # List of already failed w/arcs failed_files = [] diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 0fc5589d..e7034050 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -141,6 +141,14 @@ class HeadInsertView(J2TemplateView): return J2TemplateView.create_template(filename, desc, HeadInsertView) + @staticmethod + def init_from_config(config): + view = config.get('head_insert_view') + if not view: + html = config.get('head_insert_html', 'ui/head_insert.html') + view = HeadInsertView.create_template(html, 'Head Insert') + return view + #================================================================= # query views diff --git a/tests/test_integration.py b/tests/test_integration.py index 456d50f8..6cc32482 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -310,7 +310,7 @@ class TestWb: def test_excluded_content(self): - resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403) assert resp.status_int == 403 assert 'Excluded' in resp.body @@ -414,7 +414,7 @@ class TestWb: def test_error(self): - resp = self.testapp.get('/pywb/?abc', status = 400) + resp = self.testapp.get('/pywb/mp_/?abc', status = 400) assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body From 103a1c6455a086f67fab681083a2c8525469e247 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 4 Aug 2014 17:54:33 -0700 Subject: [PATCH 04/13] client js: use iframe onload event to detect when iframe changes, allows setting banner even for non-html captures, instead of frame notifying parent will fix issue mentioned in #41 move script from frame_insert.html -> wb_frame.js --- pywb/rewrite/rewrite_live.py | 4 +- pywb/static/wb.js | 42 +------ pywb/static/wb_frame.js | 163 ++++++++++++++++++++++++++++ pywb/ui/frame_insert.html | 99 +---------------- pywb/webapp/live_rewrite_handler.py | 4 + pywb/webapp/views.py | 2 +- 6 files changed, 178 insertions(+), 136 deletions(-) create mode 100644 pywb/static/wb_frame.js diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 6c7f33fe..cbd3f106 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -172,10 +172,12 @@ class LiveRewriter(object): urlkey=urlkey, cdx=cdx)) + if env: + env['pywb.cdx'] = cdx + return result def get_rewritten(self, *args, **kwargs): - result = self.fetch_request(*args, **kwargs) status_headers, gen, is_rewritten = result diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 0244cde8..e304831b 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -99,49 +99,15 @@ function remove_event(name, func, object) { } } -function notify_top(event) { - if (window.self == window.top) { - return; - } - - if (window.top.top != window.top) { - return; - } - - if (!window.WB_wombat_location) { - return; - } - - if (wbinfo.is_embed) { - return; - } - - if (event.target != window.document) { - return; - } - - if (typeof(window.WB_wombat_location.href) != "string") { - return; - } - - if (window.top.update_wb_url) { - window.top.update_wb_url(window.WB_wombat_location.href, - wbinfo.timestamp, - wbinfo.capture_str, - wbinfo.is_live); - } -} - var detect_on_init = function(event) { - init_banner(); - notify_top(event); + init_banner(); + remove_event("readystatechange", detect_on_init, document); } add_event("readystatechange", detect_on_init, document); - if (wbinfo.is_frame_mp && wbinfo.canon_url && (window.self == window.top) && (window.self.top == window.top) && window.location.href != wbinfo.canon_url) { @@ -149,6 +115,8 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url && window.location.replace(wbinfo.canon_url); } -return {'labels': labels}; +return {'labels': labels, + 'add_event': add_event, + 'remove_event': remove_event}; })(); diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js new file mode 100644 index 00000000..16bddef8 --- /dev/null +++ b/pywb/static/wb_frame.js @@ -0,0 +1,163 @@ +var update_wb_url = push_state; + +function make_outer_url(url, ts) +{ + if (ts) { + return wbinfo.prefix + ts + "/" + url; + } else { + return wbinfo.prefix + url; + } +} + +function make_inner_url(url, ts) +{ + if (ts) { + return wbinfo.prefix + ts + "mp_/" + url; + } else { + return wbinfo.prefix + "mp_/" + url; + } +} + +function push_state(url, timestamp, capture_str, is_live) { +/* var curr_href = null; + + if (window.frames[0].WB_wombat_location) { + curr_href = window.frames[0].WB_wombat_location.href; + } + + if (url != curr_href) { + update_status(capture_str, is_live); + return; + } + + if (!timestamp) { + timestamp = extract_ts(window.frames[0].location.href); + } +*/ + var state = {} + state.timestamp = timestamp; + state.outer_url = make_outer_url(url, state.timestamp); + state.inner_url = make_inner_url(url, state.timestamp); + state.url = url; + state.capture_str = capture_str; + state.is_live = is_live; + + window.history.replaceState(state, "", state.outer_url); + + update_status(state.capture_str, is_live); +} + +function pop_state(state) { + update_status(state.capture_str, state.is_live); + + window.frames[0].src = state.outer_url; +} + +function extract_ts(url) +{ + var inx = url.indexOf("mp_"); + if (inx < 0) { + return ""; + } + url = url.substring(0, inx); + inx = url.lastIndexOf("/"); + if (inx <= 0) { + return ""; + } + return url.substring(inx + 1); +} + +function extract_replay_url(url) { + var inx = url.indexOf("/http:"); + if (inx < 0) { + inx = url.indexOf("/https:"); + if (inx < 0) { + return ""; + } + } + return url.substring(inx + 1); +} + +function update_status(str, is_live) { + var capture_info = document.getElementById("_wb_capture_info"); + if (capture_info) { + capture_info.innerHTML = str; + } + + var label = document.getElementById("_wb_label"); + if (label) { + if (is_live) { + label.innerHTML = _wb_js.labels.LIVE_MSG; + } else { + label.innerHTML = _wb_js.labels.REPLAY_MSG; + } + } +} + +function ts_to_date(ts, is_gmt) +{ + if (ts.length < 14) { + return ts; + } + + var datestr = (ts.substring(0, 4) + "-" + + ts.substring(4, 6) + "-" + + ts.substring(6, 8) + "T" + + ts.substring(8, 10) + ":" + + ts.substring(10, 12) + ":" + + ts.substring(12, 14) + "-00:00"); + + var date = new Date(datestr); + if (is_gmt) { + return date.toGMTString(); + } else { + return date.toLocaleString(); + } +} + +window.onpopstate = function(event) { + var curr_state = event.state; + + if (curr_state) { + pop_state(curr_state); + } +} + +function extract_ts_cookie(value) { + var regex = /pywb.timestamp=([\d]{1,14})/; + var result = value.match(regex); + if (result) { + return result[1]; + } else { + return ""; + } +} + +function iframe_loaded(event) { + var iframe = window.frames[0]; + var url; + var ts; + var capture_str; + var is_live = false; + + if (iframe.WB_wombat_location) { + url = window.WB_wombat_location.href; + } else { + url = extract_replay_url(iframe.location.href); + } + + if (iframe.wbinfo) { + ts = iframe.wbinfo.timestamp; + is_live = iframe.wbinfo.is_live; + capture_str = iframe.wbinfo.capture_str; + } else { + ts = extract_ts(iframe.location.href); + if (!ts) { + is_live = true; + ts = extract_ts_cookie(iframe.document.cookie); + } + capture_str = ts_to_date(ts, true); + } + + update_wb_url(url, ts, capture_str, is_live); +} diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html index 19426c40..9b8b1e51 100644 --- a/pywb/ui/frame_insert.html +++ b/pywb/ui/frame_insert.html @@ -10,107 +10,12 @@ wbinfo.is_frame = true; - +
-