1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor:

- move is_identity(), is_embed() to wburl from wbrequest
- add is_mainpage() predicate
- add create_template() to each J2TemplateView to create itself
- add HeadInsertView to create a reusable head insert for
RewriteContent
- add 'mp_' (main page) modifier for frames mode, usable as the default
  modifier in HTMLRewriter
This commit is contained in:
Ilya Kreymer 2014-04-09 10:01:44 -07:00
parent 1fb6f5eff7
commit 19f2df4717
11 changed files with 127 additions and 137 deletions

View File

@ -87,17 +87,6 @@ class WbRequest(object):
self._parse_extra()
@property
def is_embed(self):
return (self.wb_url and
self.wb_url.mod and
self.wb_url.mod != 'id_')
@property
def is_identity(self):
return (self.wb_url and
self.wb_url.mod == 'id_')
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if value and value.lower() == 'xmlhttprequest':

View File

@ -19,35 +19,40 @@ class HTMLRewriterMixin(object):
to rewriters for script and css
"""
REWRITE_TAGS = {
'a': {'href': ''},
'applet': {'codebase': 'oe_',
'archive': 'oe_'},
'area': {'href': ''},
'base': {'href': ''},
'blockquote': {'cite': ''},
'body': {'background': 'im_'},
'del': {'cite': ''},
'embed': {'src': 'oe_'},
'head': {'': ''}, # for head rewriting
'iframe': {'src': 'if_'},
'img': {'src': 'im_'},
'ins': {'cite': ''},
'input': {'src': 'im_'},
'form': {'action': ''},
'frame': {'src': 'fr_'},
'link': {'href': 'oe_'},
'meta': {'content': ''},
'object': {'codebase': 'oe_',
'data': 'oe_'},
'q': {'cite': ''},
'ref': {'href': 'oe_'},
'script': {'src': 'js_'},
'div': {'data-src': '',
'data-uri': ''},
'li': {'data-src': '',
'data-uri': ''},
}
# Build the per-instance table mapping an HTML tag name to a dict of
# {attribute name: rewrite modifier}.
#
# defmod is the modifier applied to attributes that have no resource-specific
# modifier of their own (links, citations, form actions, data-* attributes).
# Attributes with a fixed modifier ('oe_', 'im_', 'if_', 'fr_', 'js_') keep it
# regardless of defmod.
#
# Returns a new dict on every call.
@staticmethod
def _init_rewrite_tags(defmod):
rewrite_tags = {
'a': {'href': defmod},
'applet': {'codebase': 'oe_',
'archive': 'oe_'},
'area': {'href': defmod},
'base': {'href': defmod},
'blockquote': {'cite': defmod},
'body': {'background': 'im_'},
'del': {'cite': defmod},
'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting
'iframe': {'src': 'if_'},
'img': {'src': 'im_'},
'ins': {'cite': defmod},
'input': {'src': 'im_'},
'form': {'action': defmod},
'frame': {'src': 'fr_'},
'link': {'href': 'oe_'},
'meta': {'content': defmod},
'object': {'codebase': 'oe_',
'data': 'oe_'},
'q': {'cite': defmod},
'ref': {'href': 'oe_'},
'script': {'src': 'js_'},
'div': {'data-src': defmod,
'data-uri': defmod},
'li': {'data-src': defmod,
'data-uri': defmod},
}
return rewrite_tags
STATE_TAGS = ['script', 'style']
@ -70,7 +75,8 @@ class HTMLRewriterMixin(object):
def __init__(self, url_rewriter,
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter):
css_rewriter_class=CSSRewriter,
defmod=''):
self.url_rewriter = url_rewriter
self._wb_parse_context = None
@ -79,6 +85,7 @@ class HTMLRewriterMixin(object):
self.css_rewriter = css_rewriter_class(url_rewriter)
self.head_insert = head_insert
self.rewrite_tags = self._init_rewrite_tags(defmod)
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
@ -140,9 +147,9 @@ class HTMLRewriterMixin(object):
self.head_insert = None
# attr rewriting
handler = self.REWRITE_TAGS.get(tag)
handler = self.rewrite_tags.get(tag)
if not handler:
handler = self.REWRITE_TAGS.get('')
handler = self.rewrite_tags.get('')
if not handler:
return False
@ -245,16 +252,9 @@ class HTMLRewriterMixin(object):
#=================================================================
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def __init__(self, url_rewriter,
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter):
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self)
super(HTMLRewriter, self).__init__(url_rewriter,
head_insert,
js_rewriter_class,
css_rewriter_class)
super(HTMLRewriter, self).__init__(*args, **kwargs)
def feed(self, string):
try:

View File

@ -17,15 +17,8 @@ from html_rewriter import HTMLRewriterMixin
class LXMLHTMLRewriter(HTMLRewriterMixin):
END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)
def __init__(self, url_rewriter,
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter):
super(LXMLHTMLRewriter, self).__init__(url_rewriter,
head_insert,
js_rewriter_class,
css_rewriter_class)
def __init__(self, *args, **kwargs):
super(LXMLHTMLRewriter, self).__init__(*args, **kwargs)
self.target = RewriterTarget(self)
self.parser = lxml.etree.HTMLParser(remove_pis=False,

View File

@ -16,10 +16,11 @@ from pywb.utils.bufferedreaders import ChunkedDataReader
#=================================================================
class RewriteContent:
def __init__(self, ds_rules_file=None):
def __init__(self, ds_rules_file=None, defmod=''):
self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
self.defmod = defmod
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
@ -111,7 +112,8 @@ class RewriteContent:
rewriter = rewriter_class(urlrewriter,
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str)
head_insert=head_insert_str,
defmod=self.defmod)
else:
# apply one of (js, css, xml) rewriters

View File

@ -194,6 +194,21 @@ class WbUrl(BaseWbUrl):
else:
return url
# True when this url has no modifier at all, or the modifier is 'mp_'.
@property
def is_mainpage(self):
return (not self.mod or
self.mod == 'mp_')
# Truthy when a modifier is set and it is neither 'id_' nor 'mp_'.
# NOTE: because of the `and` chain, an unset modifier yields the falsy
# value of self.mod itself rather than a strict False.
@property
def is_embed(self):
return (self.mod and
self.mod != 'id_' and
self.mod != 'mp_')
# True only when the modifier is exactly 'id_'.
@property
def is_identity(self):
return (self.mod == 'id_')
def __str__(self):
return self.to_str()

View File

@ -15,9 +15,9 @@ window.addEventListener("message", update_url, false);
function push_state(url) {
state = {}
state.inner_url = wbinfo.prefix + url;
state.outer_url = wbinfo.prefix + "fr_/" + url;
state.outer_url = wbinfo.prefix + url;
state.inner_url = wbinfo.prefix + "mp_/" + url;
if (url == wbinfo.capture_url) {
return;
}
@ -30,6 +30,7 @@ function pop_state(url) {
}
function update_url(event) {
console.log(event);
if (event.source == window.frames[0]) {
push_state(event.data);
}

View File

@ -11,7 +11,7 @@
<script>
wbinfo = {}
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
wbinfo.is_embed = {{"true" if wbrequest.is_embed else "false"}};
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
</script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -11,7 +11,9 @@ from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewriterules import use_lxml_parser
from views import load_template_file, load_query_template, add_env_globals
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
from replay_views import ReplayView
from query_handler import QueryHandler
@ -78,8 +80,9 @@ def create_wb_handler(query_handler, config,
if template_globals:
add_env_globals(template_globals)
head_insert_view = load_template_file(config.get('head_insert_html'),
'Head Insert')
head_insert_view = (HeadInsertView.
create_template(config.get('head_insert_html'),
'Head Insert'))
replayer = ReplayView(
content_loader=resolving_loader,
@ -97,8 +100,9 @@ def create_wb_handler(query_handler, config,
reporter=config.get('reporter')
)
search_view = load_template_file(config.get('search_html'),
'Search Page')
search_view = (J2TemplateView.
create_template(config.get('search_html'),
'Search Page'))
wb_handler_class = config.get('wb_handler_class', WBHandler)
@ -120,8 +124,9 @@ def init_collection(value, config):
ds_rules_file = route_config.get('domain_specific_rules', None)
html_view = load_query_template(config.get('query_html'),
'Captures Page')
html_view = (J2HtmlCapturesView.
create_template(config.get('query_html'),
'Captures Page'))
query_handler = QueryHandler.init_from_config(route_config,
ds_rules_file,
@ -247,9 +252,9 @@ def create_wb_router(passed_config={}):
abs_path=config.get('absolute_paths', True),
home_view=load_template_file(config.get('home_html'),
'Home Page'),
home_view=J2TemplateView.create_template(config.get('home_html'),
'Home Page'),
error_view=load_template_file(config.get('error_html'),
'Error Page')
error_view=J2TemplateView.create_template(config.get('error_html'),
'Error Page')
)

View File

@ -113,7 +113,10 @@ class ReplayView(object):
urlrewriter = wbrequest.urlrewriter
head_insert_func = self.get_head_insert_func(wbrequest, cdx)
head_insert_func = None
if self.head_insert_view:
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
cdx)
result = (self.content_rewriter.
rewrite_content(urlrewriter,
@ -121,7 +124,7 @@ class ReplayView(object):
stream=stream,
head_insert_func=head_insert_func,
urlkey=cdx['urlkey'],
sanitize_only=wbrequest.is_identity))
sanitize_only=wbrequest.wb_url.is_identity))
(status_headers, response_iter, is_rewritten) = result
@ -141,18 +144,6 @@ class ReplayView(object):
return response
def get_head_insert_func(self, wbrequest, cdx):
# no head insert specified
if not self.head_insert_view:
return None
def make_head_insert(rule):
return (self.head_insert_view.
render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
return make_head_insert
# Buffer rewrite iterator and return a response from a string
def buffered_response(self, status_headers, iterator):
out = BytesIO()
@ -207,7 +198,7 @@ class ReplayView(object):
# skip all 304s
if (status_headers.statusline.startswith('304') and
not wbrequest.is_identity):
not wbrequest.wb_url.is_identity):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))

View File

@ -13,22 +13,24 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.rewriterules import use_lxml_parser
import datetime
#import urllib2
import urlparse
import httplib
import requests
from io import BytesIO, BufferedReader
from views import load_template_file
from views import J2TemplateView, HeadInsertView
class RewriteHandler(WbUrlHandler): # pragma: no cover
def __init__(self, head_insert_view=None):
#use_lxml_parser()
self.rewriter = RewriteContent()
self.head_insert_view = load_template_file('ui/head_insert.html', 'Head Insert')
self.frame_insert_view = load_template_file('ui/frame_insert.html', 'Frame Insert')
self.rewriter = RewriteContent(defmod='mp_')
self.head_insert_view = (HeadInsertView.
create_template('ui/head_insert.html',
'Head Insert'))
self.frame_insert_view = (J2TemplateView.
create_template('ui/frame_insert.html',
'Frame Insert'))
def proxy_request(self, url, env):
@ -76,36 +78,12 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover
stream=True)
return req
def do_request(self, method, url, data, req_headers):
splits = urlparse.urlsplit(url)
hostport = splits.netloc.split(':', 1)
host = hostport[0]
if len(hostport) == 2:
port = hostport[1]
else:
port = None
path = splits.path
if splits.query:
path += '?' + splits.query
if splits.scheme == 'https':
conn = httplib.HTTPSConnection(host, port)
else:
conn = httplib.HTTPConnection(host, port)
conn.request(method.upper(), path, data, req_headers)
return conn.getresponse()
def __call__(self, wbrequest):
url = wbrequest.wb_url.url
if wbrequest.wb_url.mod == 'fr_':
embed_url = wbrequest.wb_url.to_str(mod='')
if not wbrequest.wb_url.mod:
embed_url = wbrequest.wb_url.to_str(mod='mp_')
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
return self.frame_insert_view.render_response(embed_url=embed_url,
@ -133,7 +111,9 @@ class RewriteHandler(WbUrlHandler): # pragma: no cover
}
head_insert_func = self.get_head_insert_func(wbrequest, cdx)
#head_insert_func = self.get_head_insert_func(wbrequest, cdx)
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
cdx)
result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
status_headers,

View File

@ -101,6 +101,14 @@ class J2TemplateView:
status=status,
content_type=content_type)
@staticmethod
def create_template(filename, desc='', view_class=None):
if not view_class:
view_class = J2TemplateView
logging.debug('Adding {0}: {1}'.format(desc, filename))
return view_class(filename)
#=================================================================
def add_env_globals(glb):
@ -108,17 +116,18 @@ def add_env_globals(glb):
#=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
class HeadInsertView(J2TemplateView):
# Return a function that takes a rewrite rule and renders this view to a
# string, passing wbrequest, cdx and the rule as keyword arguments to
# render_to_string. wbrequest and cdx are captured by the closure, so only
# the rule has to be supplied at call time.
def create_insert_func(self, wbrequest, cdx):
def make_head_insert(rule):
return (self.render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
return make_head_insert
return file
#=================================================================
def load_query_template(file, desc=None):
return load_template_file(file, desc, J2HtmlCapturesView)
# Factory for this view type: delegates to J2TemplateView.create_template,
# passing HeadInsertView as the view class to instantiate.
@staticmethod
def create_template(filename, desc=''):
return J2TemplateView.create_template(filename, desc,
HeadInsertView)
#=================================================================
@ -132,6 +141,11 @@ class J2HtmlCapturesView(J2TemplateView):
type=wbrequest.wb_url.type,
prefix=wbrequest.wb_prefix)
# Factory for this view type: delegates to J2TemplateView.create_template,
# passing J2HtmlCapturesView as the view class to instantiate.
@staticmethod
def create_template(filename, desc=''):
return J2TemplateView.create_template(filename, desc,
J2HtmlCapturesView)
#=================================================================
class MementoTimemapView(object):