mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor:
- move is_identity() and is_embed() from wbrequest to wburl
- add is_mainpage() predicate
- add create_template() to each J2TemplateView so that each view class can create itself
- add HeadInsertView to create a reusable head insert for RewriteContent
- add 'mp_' as the modifier for frames mode, usable as a possible modifier with HTMLRewriter
This commit is contained in:
parent
1fb6f5eff7
commit
19f2df4717
@ -87,17 +87,6 @@ class WbRequest(object):
|
||||
|
||||
self._parse_extra()
|
||||
|
||||
@property
|
||||
def is_embed(self):
|
||||
return (self.wb_url and
|
||||
self.wb_url.mod and
|
||||
self.wb_url.mod != 'id_')
|
||||
|
||||
@property
|
||||
def is_identity(self):
|
||||
return (self.wb_url and
|
||||
self.wb_url.mod == 'id_')
|
||||
|
||||
def _is_ajax(self):
|
||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||
if value and value.lower() == 'xmlhttprequest':
|
||||
|
@ -19,35 +19,40 @@ class HTMLRewriterMixin(object):
|
||||
to rewriters for script and css
|
||||
"""
|
||||
|
||||
REWRITE_TAGS = {
|
||||
'a': {'href': ''},
|
||||
'applet': {'codebase': 'oe_',
|
||||
'archive': 'oe_'},
|
||||
'area': {'href': ''},
|
||||
'base': {'href': ''},
|
||||
'blockquote': {'cite': ''},
|
||||
'body': {'background': 'im_'},
|
||||
'del': {'cite': ''},
|
||||
'embed': {'src': 'oe_'},
|
||||
'head': {'': ''}, # for head rewriting
|
||||
'iframe': {'src': 'if_'},
|
||||
'img': {'src': 'im_'},
|
||||
'ins': {'cite': ''},
|
||||
'input': {'src': 'im_'},
|
||||
'form': {'action': ''},
|
||||
'frame': {'src': 'fr_'},
|
||||
'link': {'href': 'oe_'},
|
||||
'meta': {'content': ''},
|
||||
'object': {'codebase': 'oe_',
|
||||
'data': 'oe_'},
|
||||
'q': {'cite': ''},
|
||||
'ref': {'href': 'oe_'},
|
||||
'script': {'src': 'js_'},
|
||||
'div': {'data-src': '',
|
||||
'data-uri': ''},
|
||||
'li': {'data-src': '',
|
||||
'data-uri': ''},
|
||||
}
|
||||
@staticmethod
def _init_rewrite_tags(defmod):
    """Build the table of HTML tags/attributes whose values get rewritten.

    :param defmod: modifier string assigned to every attribute that does
        not carry a fixed, type-specific modifier.
    :return: dict mapping a tag name to a dict of
        ``{attribute name: modifier string}``.
    """
    # Attributes that always use a fixed modifier, whatever defmod is.
    rewrite_tags = {
        'applet': {'codebase': 'oe_', 'archive': 'oe_'},
        'body': {'background': 'im_'},
        'embed': {'src': 'oe_'},
        'iframe': {'src': 'if_'},
        'img': {'src': 'im_'},
        'input': {'src': 'im_'},
        'frame': {'src': 'fr_'},
        'link': {'href': 'oe_'},
        'object': {'codebase': 'oe_', 'data': 'oe_'},
        'ref': {'href': 'oe_'},
        'script': {'src': 'js_'},
    }

    # Attributes that take the caller-supplied default modifier.
    default_mod_attrs = (
        ('a', ('href',)),
        ('area', ('href',)),
        ('base', ('href',)),
        ('blockquote', ('cite',)),
        ('del', ('cite',)),
        ('head', ('',)),  # for head rewriting
        ('ins', ('cite',)),
        ('form', ('action',)),
        ('meta', ('content',)),
        ('q', ('cite',)),
        ('div', ('data-src', 'data-uri')),
        ('li', ('data-src', 'data-uri')),
    )

    for tag_name, attr_names in default_mod_attrs:
        rewrite_tags[tag_name] = dict.fromkeys(attr_names, defmod)

    return rewrite_tags
|
||||
|
||||
|
||||
STATE_TAGS = ['script', 'style']
|
||||
|
||||
@ -70,7 +75,8 @@ class HTMLRewriterMixin(object):
|
||||
def __init__(self, url_rewriter,
|
||||
head_insert=None,
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter):
|
||||
css_rewriter_class=CSSRewriter,
|
||||
defmod=''):
|
||||
|
||||
self.url_rewriter = url_rewriter
|
||||
self._wb_parse_context = None
|
||||
@ -79,6 +85,7 @@ class HTMLRewriterMixin(object):
|
||||
self.css_rewriter = css_rewriter_class(url_rewriter)
|
||||
|
||||
self.head_insert = head_insert
|
||||
self.rewrite_tags = self._init_rewrite_tags(defmod)
|
||||
|
||||
# ===========================
|
||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
|
||||
@ -140,9 +147,9 @@ class HTMLRewriterMixin(object):
|
||||
self.head_insert = None
|
||||
|
||||
# attr rewriting
|
||||
handler = self.REWRITE_TAGS.get(tag)
|
||||
handler = self.rewrite_tags.get(tag)
|
||||
if not handler:
|
||||
handler = self.REWRITE_TAGS.get('')
|
||||
handler = self.rewrite_tags.get('')
|
||||
|
||||
if not handler:
|
||||
return False
|
||||
@ -245,16 +252,9 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
def __init__(self, url_rewriter,
|
||||
head_insert=None,
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self)
|
||||
super(HTMLRewriter, self).__init__(url_rewriter,
|
||||
head_insert,
|
||||
js_rewriter_class,
|
||||
css_rewriter_class)
|
||||
super(HTMLRewriter, self).__init__(*args, **kwargs)
|
||||
|
||||
def feed(self, string):
|
||||
try:
|
||||
|
@ -17,15 +17,8 @@ from html_rewriter import HTMLRewriterMixin
|
||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)
|
||||
|
||||
def __init__(self, url_rewriter,
|
||||
head_insert=None,
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter):
|
||||
|
||||
super(LXMLHTMLRewriter, self).__init__(url_rewriter,
|
||||
head_insert,
|
||||
js_rewriter_class,
|
||||
css_rewriter_class)
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(LXMLHTMLRewriter, self).__init__(*args, **kwargs)
|
||||
|
||||
self.target = RewriterTarget(self)
|
||||
self.parser = lxml.etree.HTMLParser(remove_pis=False,
|
||||
|
@ -16,10 +16,11 @@ from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
|
||||
#=================================================================
|
||||
class RewriteContent:
|
||||
def __init__(self, ds_rules_file=None):
|
||||
def __init__(self, ds_rules_file=None, defmod=''):
|
||||
self.ruleset = RuleSet(RewriteRules, 'rewrite',
|
||||
default_rule_config={},
|
||||
ds_rules_file=ds_rules_file)
|
||||
self.defmod = defmod
|
||||
|
||||
def sanitize_content(self, status_headers, stream):
|
||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||
@ -111,7 +112,8 @@ class RewriteContent:
|
||||
rewriter = rewriter_class(urlrewriter,
|
||||
js_rewriter_class=rule.rewriters['js'],
|
||||
css_rewriter_class=rule.rewriters['css'],
|
||||
head_insert=head_insert_str)
|
||||
head_insert=head_insert_str,
|
||||
defmod=self.defmod)
|
||||
|
||||
else:
|
||||
# apply one of (js, css, xml) rewriters
|
||||
|
@ -194,6 +194,21 @@ class WbUrl(BaseWbUrl):
|
||||
else:
|
||||
return url
|
||||
|
||||
@property
def is_mainpage(self):
    """True when the url has no modifier, or has the 'mp_' modifier."""
    if not self.mod:
        return True

    return self.mod == 'mp_'
|
||||
|
||||
@property
def is_embed(self):
    """True when a modifier is set and it is neither 'id_' nor 'mp_'.

    Always returns a real bool: an empty or missing modifier yields
    False rather than leaking the falsy ``mod`` value ('' or None)
    to the caller.
    """
    return bool(self.mod) and self.mod not in ('id_', 'mp_')
|
||||
|
||||
@property
def is_identity(self):
    """True when the url carries the 'id_' modifier."""
    return self.mod == 'id_'
|
||||
|
||||
def __str__(self):
|
||||
return self.to_str()
|
||||
|
||||
|
@ -15,9 +15,9 @@ window.addEventListener("message", update_url, false);
|
||||
|
||||
function push_state(url) {
|
||||
state = {}
|
||||
state.inner_url = wbinfo.prefix + url;
|
||||
state.outer_url = wbinfo.prefix + "fr_/" + url;
|
||||
|
||||
state.outer_url = wbinfo.prefix + url;
|
||||
state.inner_url = wbinfo.prefix + "mp_/" + url;
|
||||
|
||||
if (url == wbinfo.capture_url) {
|
||||
return;
|
||||
}
|
||||
@ -30,6 +30,7 @@ function pop_state(url) {
|
||||
}
|
||||
|
||||
function update_url(event) {
|
||||
console.log(event);
|
||||
if (event.source == window.frames[0]) {
|
||||
push_state(event.data);
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
<script>
|
||||
wbinfo = {}
|
||||
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
|
||||
wbinfo.is_embed = {{"true" if wbrequest.is_embed else "false"}};
|
||||
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
|
||||
</script>
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
|
||||
|
@ -11,7 +11,9 @@ from pywb.warc.resolvingloader import ResolvingLoader
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||
|
||||
from views import load_template_file, load_query_template, add_env_globals
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from views import J2HtmlCapturesView, HeadInsertView
|
||||
|
||||
from replay_views import ReplayView
|
||||
|
||||
from query_handler import QueryHandler
|
||||
@ -78,8 +80,9 @@ def create_wb_handler(query_handler, config,
|
||||
if template_globals:
|
||||
add_env_globals(template_globals)
|
||||
|
||||
head_insert_view = load_template_file(config.get('head_insert_html'),
|
||||
'Head Insert')
|
||||
head_insert_view = (HeadInsertView.
|
||||
create_template(config.get('head_insert_html'),
|
||||
'Head Insert'))
|
||||
|
||||
replayer = ReplayView(
|
||||
content_loader=resolving_loader,
|
||||
@ -97,8 +100,9 @@ def create_wb_handler(query_handler, config,
|
||||
reporter=config.get('reporter')
|
||||
)
|
||||
|
||||
search_view = load_template_file(config.get('search_html'),
|
||||
'Search Page')
|
||||
search_view = (J2TemplateView.
|
||||
create_template(config.get('search_html'),
|
||||
'Search Page'))
|
||||
|
||||
wb_handler_class = config.get('wb_handler_class', WBHandler)
|
||||
|
||||
@ -120,8 +124,9 @@ def init_collection(value, config):
|
||||
|
||||
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||
|
||||
html_view = load_query_template(config.get('query_html'),
|
||||
'Captures Page')
|
||||
html_view = (J2HtmlCapturesView.
|
||||
create_template(config.get('query_html'),
|
||||
'Captures Page'))
|
||||
|
||||
query_handler = QueryHandler.init_from_config(route_config,
|
||||
ds_rules_file,
|
||||
@ -247,9 +252,9 @@ def create_wb_router(passed_config={}):
|
||||
|
||||
abs_path=config.get('absolute_paths', True),
|
||||
|
||||
home_view=load_template_file(config.get('home_html'),
|
||||
'Home Page'),
|
||||
home_view=J2TemplateView.create_template(config.get('home_html'),
|
||||
'Home Page'),
|
||||
|
||||
error_view=load_template_file(config.get('error_html'),
|
||||
'Error Page')
|
||||
error_view=J2TemplateView.create_template(config.get('error_html'),
|
||||
'Error Page')
|
||||
)
|
||||
|
@ -113,7 +113,10 @@ class ReplayView(object):
|
||||
|
||||
urlrewriter = wbrequest.urlrewriter
|
||||
|
||||
head_insert_func = self.get_head_insert_func(wbrequest, cdx)
|
||||
head_insert_func = None
|
||||
if self.head_insert_view:
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
|
||||
cdx)
|
||||
|
||||
result = (self.content_rewriter.
|
||||
rewrite_content(urlrewriter,
|
||||
@ -121,7 +124,7 @@ class ReplayView(object):
|
||||
stream=stream,
|
||||
head_insert_func=head_insert_func,
|
||||
urlkey=cdx['urlkey'],
|
||||
sanitize_only=wbrequest.is_identity))
|
||||
sanitize_only=wbrequest.wb_url.is_identity))
|
||||
|
||||
(status_headers, response_iter, is_rewritten) = result
|
||||
|
||||
@ -141,18 +144,6 @@ class ReplayView(object):
|
||||
|
||||
return response
|
||||
|
||||
def get_head_insert_func(self, wbrequest, cdx):
|
||||
# no head insert specified
|
||||
if not self.head_insert_view:
|
||||
return None
|
||||
|
||||
def make_head_insert(rule):
|
||||
return (self.head_insert_view.
|
||||
render_to_string(wbrequest=wbrequest,
|
||||
cdx=cdx,
|
||||
rule=rule))
|
||||
return make_head_insert
|
||||
|
||||
# Buffer rewrite iterator and return a response from a string
|
||||
def buffered_response(self, status_headers, iterator):
|
||||
out = BytesIO()
|
||||
@ -207,7 +198,7 @@ class ReplayView(object):
|
||||
|
||||
# skip all 304s
|
||||
if (status_headers.statusline.startswith('304') and
|
||||
not wbrequest.is_identity):
|
||||
not wbrequest.wb_url.is_identity):
|
||||
|
||||
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
|
||||
|
||||
|
@ -13,22 +13,24 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||
|
||||
import datetime
|
||||
#import urllib2
|
||||
import urlparse
|
||||
import httplib
|
||||
import requests
|
||||
|
||||
from io import BytesIO, BufferedReader
|
||||
|
||||
from views import load_template_file
|
||||
from views import J2TemplateView, HeadInsertView
|
||||
|
||||
|
||||
class RewriteHandler(WbUrlHandler): # pragma: no cover
|
||||
def __init__(self, head_insert_view=None):
|
||||
#use_lxml_parser()
|
||||
self.rewriter = RewriteContent()
|
||||
self.head_insert_view = load_template_file('ui/head_insert.html', 'Head Insert')
|
||||
self.frame_insert_view = load_template_file('ui/frame_insert.html', 'Frame Insert')
|
||||
self.rewriter = RewriteContent(defmod='mp_')
|
||||
self.head_insert_view = (HeadInsertView.
|
||||
create_template('ui/head_insert.html',
|
||||
'Head Insert'))
|
||||
|
||||
self.frame_insert_view = (J2TemplateView.
|
||||
create_template('ui/frame_insert.html',
|
||||
'Frame Insert'))
|
||||
|
||||
def proxy_request(self, url, env):
|
||||
|
||||
@ -76,36 +78,12 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover
|
||||
stream=True)
|
||||
return req
|
||||
|
||||
def do_request(self, method, url, data, req_headers):
|
||||
splits = urlparse.urlsplit(url)
|
||||
|
||||
hostport = splits.netloc.split(':', 1)
|
||||
host = hostport[0]
|
||||
|
||||
if len(hostport) == 2:
|
||||
port = hostport[1]
|
||||
else:
|
||||
port = None
|
||||
|
||||
path = splits.path
|
||||
|
||||
if splits.query:
|
||||
path += '?' + splits.query
|
||||
|
||||
if splits.scheme == 'https':
|
||||
conn = httplib.HTTPSConnection(host, port)
|
||||
else:
|
||||
conn = httplib.HTTPConnection(host, port)
|
||||
|
||||
conn.request(method.upper(), path, data, req_headers)
|
||||
return conn.getresponse()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
|
||||
url = wbrequest.wb_url.url
|
||||
|
||||
if wbrequest.wb_url.mod == 'fr_':
|
||||
embed_url = wbrequest.wb_url.to_str(mod='')
|
||||
if not wbrequest.wb_url.mod:
|
||||
embed_url = wbrequest.wb_url.to_str(mod='mp_')
|
||||
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
|
||||
return self.frame_insert_view.render_response(embed_url=embed_url,
|
||||
@ -133,7 +111,9 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover
|
||||
}
|
||||
|
||||
|
||||
head_insert_func = self.get_head_insert_func(wbrequest, cdx)
|
||||
#head_insert_func = self.get_head_insert_func(wbrequest, cdx)
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
|
||||
cdx)
|
||||
|
||||
result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
|
||||
status_headers,
|
||||
|
@ -101,6 +101,14 @@ class J2TemplateView:
|
||||
status=status,
|
||||
content_type=content_type)
|
||||
|
||||
@staticmethod
def create_template(filename, desc='', view_class=None):
    """Create a template view for ``filename``.

    :param filename: template file to wrap; when falsy (e.g. the config
        entry is not set) no view is created.
    :param desc: human-readable description used in the debug log line.
    :param view_class: class to instantiate with ``filename``; defaults
        to J2TemplateView.
    :return: ``view_class(filename)``, or None when ``filename`` is falsy.
    """
    # An unset config entry must yield no view at all, so that callers
    # testing the view for truthiness skip it, instead of receiving a
    # view object wrapping None.
    if not filename:
        return None

    if not view_class:
        view_class = J2TemplateView

    logging.debug('Adding {0}: {1}'.format(desc, filename))
    return view_class(filename)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def add_env_globals(glb):
|
||||
@ -108,17 +116,18 @@ def add_env_globals(glb):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_template_file(file, desc=None, view_class=J2TemplateView):
|
||||
if file:
|
||||
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
|
||||
file = view_class(file)
|
||||
class HeadInsertView(J2TemplateView):
|
||||
def create_insert_func(self, wbrequest, cdx):
|
||||
def make_head_insert(rule):
|
||||
return (self.render_to_string(wbrequest=wbrequest,
|
||||
cdx=cdx,
|
||||
rule=rule))
|
||||
return make_head_insert
|
||||
|
||||
return file
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_query_template(file, desc=None):
|
||||
return load_template_file(file, desc, J2HtmlCapturesView)
|
||||
@staticmethod
|
||||
def create_template(filename, desc=''):
|
||||
return J2TemplateView.create_template(filename, desc,
|
||||
HeadInsertView)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -132,6 +141,11 @@ class J2HtmlCapturesView(J2TemplateView):
|
||||
type=wbrequest.wb_url.type,
|
||||
prefix=wbrequest.wb_prefix)
|
||||
|
||||
@staticmethod
def create_template(filename, desc=''):
    """Create a J2HtmlCapturesView for ``filename``.

    Delegates to J2TemplateView.create_template, passing
    J2HtmlCapturesView as the view class.
    """
    view_class = J2HtmlCapturesView
    return J2TemplateView.create_template(filename, desc, view_class)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoTimemapView(object):
|
||||
|
Loading…
x
Reference in New Issue
Block a user