diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index ba1f6a02..11fd99db 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -87,17 +87,6 @@ class WbRequest(object): self._parse_extra() - @property - def is_embed(self): - return (self.wb_url and - self.wb_url.mod and - self.wb_url.mod != 'id_') - - @property - def is_identity(self): - return (self.wb_url and - self.wb_url.mod == 'id_') - def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 5a10d651..99cab8d0 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -19,35 +19,40 @@ class HTMLRewriterMixin(object): to rewriters for script and css """ - REWRITE_TAGS = { - 'a': {'href': ''}, - 'applet': {'codebase': 'oe_', - 'archive': 'oe_'}, - 'area': {'href': ''}, - 'base': {'href': ''}, - 'blockquote': {'cite': ''}, - 'body': {'background': 'im_'}, - 'del': {'cite': ''}, - 'embed': {'src': 'oe_'}, - 'head': {'': ''}, # for head rewriting - 'iframe': {'src': 'if_'}, - 'img': {'src': 'im_'}, - 'ins': {'cite': ''}, - 'input': {'src': 'im_'}, - 'form': {'action': ''}, - 'frame': {'src': 'fr_'}, - 'link': {'href': 'oe_'}, - 'meta': {'content': ''}, - 'object': {'codebase': 'oe_', - 'data': 'oe_'}, - 'q': {'cite': ''}, - 'ref': {'href': 'oe_'}, - 'script': {'src': 'js_'}, - 'div': {'data-src': '', - 'data-uri': ''}, - 'li': {'data-src': '', - 'data-uri': ''}, - } + @staticmethod + def _init_rewrite_tags(defmod): + rewrite_tags = { + 'a': {'href': defmod}, + 'applet': {'codebase': 'oe_', + 'archive': 'oe_'}, + 'area': {'href': defmod}, + 'base': {'href': defmod}, + 'blockquote': {'cite': defmod}, + 'body': {'background': 'im_'}, + 'del': {'cite': defmod}, + 'embed': {'src': 'oe_'}, + 'head': {'': defmod}, # for head rewriting + 'iframe': {'src': 'if_'}, + 'img': {'src': 'im_'}, 
+ 'ins': {'cite': defmod}, + 'input': {'src': 'im_'}, + 'form': {'action': defmod}, + 'frame': {'src': 'fr_'}, + 'link': {'href': 'oe_'}, + 'meta': {'content': defmod}, + 'object': {'codebase': 'oe_', + 'data': 'oe_'}, + 'q': {'cite': defmod}, + 'ref': {'href': 'oe_'}, + 'script': {'src': 'js_'}, + 'div': {'data-src': defmod, + 'data-uri': defmod}, + 'li': {'data-src': defmod, + 'data-uri': defmod}, + } + + return rewrite_tags + STATE_TAGS = ['script', 'style'] @@ -70,7 +75,8 @@ class HTMLRewriterMixin(object): def __init__(self, url_rewriter, head_insert=None, js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): + css_rewriter_class=CSSRewriter, + defmod=''): self.url_rewriter = url_rewriter self._wb_parse_context = None @@ -79,6 +85,7 @@ class HTMLRewriterMixin(object): self.css_rewriter = css_rewriter_class(url_rewriter) self.head_insert = head_insert + self.rewrite_tags = self._init_rewrite_tags(defmod) # =========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', @@ -140,9 +147,9 @@ class HTMLRewriterMixin(object): self.head_insert = None # attr rewriting - handler = self.REWRITE_TAGS.get(tag) + handler = self.rewrite_tags.get(tag) if not handler: - handler = self.REWRITE_TAGS.get('') + handler = self.rewrite_tags.get('') if not handler: return False @@ -245,16 +252,9 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, HTMLParser): - def __init__(self, url_rewriter, - head_insert=None, - js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): - + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) - super(HTMLRewriter, self).__init__(url_rewriter, - head_insert, - js_rewriter_class, - css_rewriter_class) + super(HTMLRewriter, self).__init__(*args, **kwargs) def feed(self, string): try: diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index abf28fc4..29355be4 
@property
def is_mainpage(self):
    """True when there is no modifier or the modifier is 'mp_'."""
    modifier = self.mod
    if not modifier:
        return True
    return modifier == 'mp_'

@property
def is_embed(self):
    """Truthy when a modifier is set and it is neither 'id_' nor 'mp_'.

    When the modifier itself is falsy (e.g. '' or None) that falsy value
    is returned unchanged rather than being coerced to ``False``.
    """
    modifier = self.mod
    if not modifier:
        return modifier
    return modifier not in ('id_', 'mp_')

@property
def is_identity(self):
    """True only for the identity modifier 'id_'."""
    modifier = self.mod
    return modifier == 'id_'
+ create_template(config.get('head_insert_html'), + 'Head Insert')) replayer = ReplayView( content_loader=resolving_loader, @@ -97,8 +100,9 @@ def create_wb_handler(query_handler, config, reporter=config.get('reporter') ) - search_view = load_template_file(config.get('search_html'), - 'Search Page') + search_view = (J2TemplateView. + create_template(config.get('search_html'), + 'Search Page')) wb_handler_class = config.get('wb_handler_class', WBHandler) @@ -120,8 +124,9 @@ def init_collection(value, config): ds_rules_file = route_config.get('domain_specific_rules', None) - html_view = load_query_template(config.get('query_html'), - 'Captures Page') + html_view = (J2HtmlCapturesView. + create_template(config.get('query_html'), + 'Captures Page')) query_handler = QueryHandler.init_from_config(route_config, ds_rules_file, @@ -247,9 +252,9 @@ def create_wb_router(passed_config={}): abs_path=config.get('absolute_paths', True), - home_view=load_template_file(config.get('home_html'), - 'Home Page'), + home_view=J2TemplateView.create_template(config.get('home_html'), + 'Home Page'), - error_view=load_template_file(config.get('error_html'), - 'Error Page') + error_view=J2TemplateView.create_template(config.get('error_html'), + 'Error Page') ) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 31fe4b57..c45b5983 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -113,7 +113,10 @@ class ReplayView(object): urlrewriter = wbrequest.urlrewriter - head_insert_func = self.get_head_insert_func(wbrequest, cdx) + head_insert_func = None + if self.head_insert_view: + head_insert_func = self.head_insert_view.create_insert_func(wbrequest, + cdx) result = (self.content_rewriter. 
rewrite_content(urlrewriter, @@ -121,7 +124,7 @@ class ReplayView(object): stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - sanitize_only=wbrequest.is_identity)) + sanitize_only=wbrequest.wb_url.is_identity)) (status_headers, response_iter, is_rewritten) = result @@ -141,18 +144,6 @@ class ReplayView(object): return response - def get_head_insert_func(self, wbrequest, cdx): - # no head insert specified - if not self.head_insert_view: - return None - - def make_head_insert(rule): - return (self.head_insert_view. - render_to_string(wbrequest=wbrequest, - cdx=cdx, - rule=rule)) - return make_head_insert - # Buffer rewrite iterator and return a response from a string def buffered_response(self, status_headers, iterator): out = BytesIO() @@ -207,7 +198,7 @@ class ReplayView(object): # skip all 304s if (status_headers.statusline.startswith('304') and - not wbrequest.is_identity): + not wbrequest.wb_url.is_identity): raise CaptureException('Skipping 304 Modified: ' + str(cdx)) diff --git a/pywb/webapp/rewrite_handler.py b/pywb/webapp/rewrite_handler.py index 07f6644c..894aae39 100644 --- a/pywb/webapp/rewrite_handler.py +++ b/pywb/webapp/rewrite_handler.py @@ -13,22 +13,24 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.rewrite.rewriterules import use_lxml_parser import datetime -#import urllib2 -import urlparse -import httplib import requests from io import BytesIO, BufferedReader -from views import load_template_file +from views import J2TemplateView, HeadInsertView class RewriteHandler(WbUrlHandler): # pragma: no cover def __init__(self, head_insert_view=None): #use_lxml_parser() - self.rewriter = RewriteContent() - self.head_insert_view = load_template_file('ui/head_insert.html', 'Head Insert') - self.frame_insert_view = load_template_file('ui/frame_insert.html', 'Frame Insert') + self.rewriter = RewriteContent(defmod='mp_') + self.head_insert_view = (HeadInsertView. 
+ create_template('ui/head_insert.html', + 'Head Insert')) + + self.frame_insert_view = (J2TemplateView. + create_template('ui/frame_insert.html', + 'Frame Insert')) def proxy_request(self, url, env): @@ -76,36 +78,12 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover stream=True) return req - def do_request(self, method, url, data, req_headers): - splits = urlparse.urlsplit(url) - - hostport = splits.netloc.split(':', 1) - host = hostport[0] - - if len(hostport) == 2: - port = hostport[1] - else: - port = None - - path = splits.path - - if splits.query: - path += '?' + splits.query - - if splits.scheme == 'https': - conn = httplib.HTTPSConnection(host, port) - else: - conn = httplib.HTTPConnection(host, port) - - conn.request(method.upper(), path, data, req_headers) - return conn.getresponse() - def __call__(self, wbrequest): url = wbrequest.wb_url.url - if wbrequest.wb_url.mod == 'fr_': - embed_url = wbrequest.wb_url.to_str(mod='') + if not wbrequest.wb_url.mod: + embed_url = wbrequest.wb_url.to_str(mod='mp_') timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) return self.frame_insert_view.render_response(embed_url=embed_url, @@ -133,7 +111,9 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover } - head_insert_func = self.get_head_insert_func(wbrequest, cdx) + #head_insert_func = self.get_head_insert_func(wbrequest, cdx) + head_insert_func = self.head_insert_view.create_insert_func(wbrequest, + cdx) result = self.rewriter.rewrite_content(wbrequest.urlrewriter, status_headers, diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index c452d0e0..9aedc230 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -101,6 +101,14 @@ class J2TemplateView: status=status, content_type=content_type) + @staticmethod + def create_template(filename, desc='', view_class=None): + if not view_class: + view_class = J2TemplateView + + logging.debug('Adding {0}: {1}'.format(desc, filename)) + return view_class(filename) + 
class HeadInsertView(J2TemplateView):
    """Template view that produces per-rule head-insert render functions."""

    def create_insert_func(self, wbrequest, cdx):
        """Return a one-argument callable rendering the head insert.

        The returned function takes a ``rule`` and renders this template
        to a string with ``wbrequest``, ``cdx`` and ``rule`` as template
        arguments.
        """
        def make_head_insert(rule):
            template_args = dict(wbrequest=wbrequest,
                                 cdx=cdx,
                                 rule=rule)
            return self.render_to_string(**template_args)

        return make_head_insert

    @staticmethod
    def create_template(filename, desc=''):
        """Delegate to J2TemplateView.create_template, requesting a
        HeadInsertView instance."""
        return J2TemplateView.create_template(filename,
                                              desc,
                                              view_class=HeadInsertView)