mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add rewriter_handler, frame wrapper support!
This commit is contained in:
parent
8897a0a7c9
commit
1fb6f5eff7
16
pywb/apps/rewrite_live.py
Normal file
16
pywb/apps/rewrite_live.py
Normal file
@ -0,0 +1,16 @@
|
||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||
|
||||
from pywb.webapp.rewrite_handler import create_rewrite_app
|
||||
|
||||
#=================================================================
|
||||
# init live rewrite app
|
||||
#=================================================================
|
||||
|
||||
# Module-level WSGI callable: built by init_app from the create_rewrite_app
# factory, with load_yaml=False.
application = init_app(create_rewrite_app, load_yaml=False)
|
||||
|
||||
|
||||
# Script entry point: hands `application` to start_wsgi_server under the
# name 'Rewrite App', with 8090 as the default port.
def main(): # pragma: no cover
|
||||
start_wsgi_server(application, 'Rewrite App', default_port=8090)
|
||||
|
||||
# Only start the server when this module is executed directly.
if __name__ == "__main__":
|
||||
main()
|
@ -5,11 +5,12 @@
|
||||
top: 0px !important;
|
||||
left: 0px !important;
|
||||
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important;
|
||||
position: absolute !important;
|
||||
padding: 4px !important;
|
||||
position: fixed !important;
|
||||
/* padding: 4px !important; */
|
||||
height: 40px !important;
|
||||
width: 100% !important;
|
||||
font-size: 24px !important;
|
||||
border: 1px solid !important;
|
||||
/* border: 1px solid !important; */
|
||||
background-color: lightYellow !important;
|
||||
color: black !important;
|
||||
text-align: center !important;
|
||||
@ -17,3 +18,35 @@
|
||||
line-height: normal !important;
|
||||
}
|
||||
|
||||
/*
 * Full-size wrapper around the replay iframe (used by frame_insert.html).
 * border-box sizing keeps the padding inside the 100% width/height.
 * The 40px top padding presumably leaves room for the banner -- confirm
 * against the banner rule's height.
 */
.wb_iframe_div {
    -webkit-box-sizing: border-box;
    -moz-box-sizing: border-box;
    box-sizing: border-box;
    width: 100%;
    height: 100%;
    padding: 40px 8px 8px 0px;
    border: none;
}
|
||||
|
||||
/* The replay iframe itself (frame_insert.html): fills its wrapper and is
   outlined with a 4px firebrick border. */
.wb_iframe {
    border: 4px solid firebrick;
    width: 100%;
    height: 100%;
}
|
||||
|
||||
/*
 * Borderless full-size variant: the firebrick background shows through the
 * padding (44px top, 4px on the other sides), which border-box sizing keeps
 * inside the 100% box.
 * NOTE(review): no element in the visible templates uses this class yet.
 */
.wb_iframe_all {
    -webkit-box-sizing: border-box;
    -moz-box-sizing: border-box;
    box-sizing: border-box;
    width: 100%;
    height: 100%;
    padding: 44px 4px 4px 4px;
    border: none;
    background-color: firebrick;
}
|
||||
|
||||
|
||||
|
||||
|
@ -26,6 +26,10 @@ function init_banner() {
|
||||
return;
|
||||
}
|
||||
|
||||
if (window.top != window.self) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!banner) {
|
||||
banner = document.createElement("wb_div");
|
||||
banner.setAttribute("id", BANNER_ID);
|
||||
@ -41,12 +45,54 @@ function init_banner() {
|
||||
}
|
||||
}
|
||||
|
||||
var readyStateCheckInterval = setInterval(function() {
|
||||
// Attach `func` as a listener for event `name` on `object`.
// Uses addEventListener when present, otherwise attachEvent with the
// "on"-prefixed event name. Returns true if a listener was attached,
// false if the object supports neither API.
function add_event(name, func, object) {
    var attached = true;

    if (object.addEventListener) {
        object.addEventListener(name, func);
    } else if (object.attachEvent) {
        object.attachEvent("on" + name, func);
    } else {
        attached = false;
    }

    return attached;
}
|
||||
|
||||
// Detach `func` from event `name` on `object`.
// Uses removeEventListener when present, otherwise detachEvent with the
// "on"-prefixed event name. Returns true if one of the two APIs was
// called, false if the object supports neither.
function remove_event(name, func, object) {
    var detached = true;

    if (object.removeEventListener) {
        object.removeEventListener(name, func);
    } else if (object.detachEvent) {
        object.detachEvent("on" + name, func);
    } else {
        detached = false;
    }

    return detached;
}
|
||||
|
||||
var notified_top = false;
|
||||
|
||||
var detect_on_init = function() {
|
||||
if (!notified_top && window && window.top && (window.self != window.top) && window.WB_wombat_location) {
|
||||
if (!wbinfo.is_embed) {
|
||||
window.top.postMessage(window.WB_wombat_location.href, "*");
|
||||
}
|
||||
notified_top = true;
|
||||
}
|
||||
|
||||
if (document.readyState === "interactive" ||
|
||||
document.readyState === "complete") {
|
||||
|
||||
init_banner();
|
||||
|
||||
clearInterval(readyStateCheckInterval);
|
||||
|
||||
remove_event("readystatechange", detect_on_init, document);
|
||||
}
|
||||
}, 10);
|
||||
}
|
||||
|
||||
add_event("readystatechange", detect_on_init, document);
|
||||
|
||||
/*
|
||||
if ((window.self == window.top) && !wbinfo.is_embed && window.location.href.indexOf("/rewrite/fr_/") == -1) {
|
||||
new_loc = window.location.href.replace("/rewrite/", "/rewrite/fr_/");
|
||||
window.location.replace(new_loc);
|
||||
}
|
||||
*/
|
||||
|
@ -60,18 +60,28 @@ WB_wombat_init = (function() {
|
||||
}
|
||||
}
|
||||
|
||||
// Return `suffix` itself when `str` ends with it, otherwise undefined.
// The search starts at the only offset where the suffix could still fit,
// so any hit is necessarily at the very end of the string.
function ends_with(str, suffix) {
    var start = str.length - suffix.length;
    var found = str.indexOf(suffix, start) !== -1;

    return found ? suffix : undefined;
}
|
||||
|
||||
//============================================
|
||||
/* function rewrite_url_debug(url) {
|
||||
var rewrite_url = rewrite_url_debug;
|
||||
|
||||
function rewrite_url_debug(url) {
|
||||
rewritten = rewrite_url_(url);
|
||||
if (url != rewritten) {
|
||||
console.log('REWRITE: ' + url + ' -> ' + rewritten);
|
||||
} else {
|
||||
console.log('NOT REWRITTEN ' + url);
|
||||
//console.log('NOT REWRITTEN ' + url);
|
||||
}
|
||||
return rewritten;
|
||||
}
|
||||
*/
|
||||
function rewrite_url(url) {
|
||||
|
||||
function rewrite_url_(url) {
|
||||
var http_prefix = "http://";
|
||||
var https_prefix = "https://";
|
||||
var rel_prefix = "//";
|
||||
@ -144,13 +154,22 @@ WB_wombat_init = (function() {
|
||||
if (!href) {
|
||||
return "";
|
||||
}
|
||||
|
||||
href = href.toString();
|
||||
|
||||
var index = href.indexOf("/http", 1);
|
||||
|
||||
// extract original url from wburl
|
||||
if (index > 0) {
|
||||
return href.substr(index + 1);
|
||||
} else {
|
||||
return href;
|
||||
href = href.substr(index + 1);
|
||||
}
|
||||
|
||||
// remove trailing slash
|
||||
if (ends_with(href, "/")) {
|
||||
href = href.substring(0, href.length - 1);
|
||||
}
|
||||
|
||||
return href;
|
||||
}
|
||||
|
||||
//============================================
|
||||
@ -196,26 +215,39 @@ WB_wombat_init = (function() {
|
||||
}
|
||||
|
||||
//============================================
|
||||
function update_location(req_href, orig_href, location) {
|
||||
if (req_href && (extract_orig(orig_href) != extract_orig(req_href))) {
|
||||
var final_href = rewrite_url(req_href);
|
||||
|
||||
location.href = final_href;
|
||||
function update_location(req_href, orig_href, actual_location) {
|
||||
if (!req_href || req_href == orig_href) {
|
||||
return;
|
||||
}
|
||||
|
||||
ext_orig = extract_orig(orig_href);
|
||||
ext_req = extract_orig(req_href);
|
||||
|
||||
if (!ext_orig || ext_orig == ext_req) {
|
||||
return;
|
||||
}
|
||||
|
||||
var final_href = rewrite_url(req_href);
|
||||
|
||||
console.log(actual_location.href + ' -> ' + final_href);
|
||||
|
||||
actual_location.href = final_href;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function check_location_change(loc, is_top) {
|
||||
var locType = (typeof loc);
|
||||
|
||||
var location = (is_top ? window.top.location : window.location);
|
||||
var actual_location = (is_top ? window.top.location : window.location);
|
||||
|
||||
//console.log(loc.href);
|
||||
|
||||
// String has been assigned to location, so assign it
|
||||
if (locType == "string") {
|
||||
update_location(loc, location.href, location)
|
||||
update_location(loc, actual_location.href, actual_location)
|
||||
|
||||
} else if (locType == "object") {
|
||||
update_location(loc.href, loc._orig_href, location);
|
||||
update_location(loc.href, loc._orig_href, actual_location);
|
||||
}
|
||||
}
|
||||
|
||||
@ -306,7 +338,6 @@ WB_wombat_init = (function() {
|
||||
window.Worker = undefined;
|
||||
}
|
||||
|
||||
|
||||
function rewrite_attr(elem, name) {
|
||||
if (!elem || !elem.getAttribute) {
|
||||
return;
|
||||
@ -324,25 +355,41 @@ WB_wombat_init = (function() {
|
||||
|
||||
orig_value = value;
|
||||
value = rewrite_url(value);
|
||||
|
||||
|
||||
elem.setAttribute(name, value);
|
||||
}
|
||||
|
||||
function init_dom_override() {
|
||||
if (!Element ||
|
||||
!Element.prototype) {
|
||||
if (!Node || !Node.prototype) {
|
||||
return;
|
||||
}
|
||||
|
||||
function replace_dom_func(funcname) {
|
||||
var orig = Node.prototype[funcname];
|
||||
|
||||
var orig = Element.prototype[funcname];
|
||||
|
||||
Element.prototype[funcname] = function() {
|
||||
Node.prototype[funcname] = function() {
|
||||
rewrite_attr(arguments[0], "src");
|
||||
rewrite_attr(arguments[0], "href");
|
||||
|
||||
return orig.apply(this, arguments);
|
||||
child = arguments[0];
|
||||
|
||||
var desc;
|
||||
|
||||
if (child instanceof DocumentFragment) {
|
||||
desc = child.querySelectorAll("*[href],*[src]");
|
||||
} else if (child.getElementsByTagName) {
|
||||
desc = child.getElementsByTagName("*");
|
||||
}
|
||||
|
||||
if (desc) {
|
||||
for (var i = 0; i < desc.length; i++) {
|
||||
rewrite_attr(desc[i], "src");
|
||||
rewrite_attr(desc[i], "href");
|
||||
}
|
||||
}
|
||||
|
||||
result = orig.apply(this, arguments);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@ -363,13 +410,14 @@ WB_wombat_init = (function() {
|
||||
window.WB_wombat_location = copy_location_obj(window.self.location);
|
||||
document.WB_wombat_location = window.WB_wombat_location;
|
||||
|
||||
if (window.self.location != window.top.location) {
|
||||
window.top.WB_wombat_location = copy_location_obj(window.top.location);
|
||||
}
|
||||
//if (window.self.location != window.top.location) {
|
||||
// window.top.WB_wombat_location = copy_location_obj(window.top.location);
|
||||
//}
|
||||
window.top.WB_wombat_location = window.WB_wombat_location;
|
||||
|
||||
if (window.opener) {
|
||||
window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
|
||||
}
|
||||
//if (window.opener) {
|
||||
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
|
||||
//}
|
||||
|
||||
// Domain
|
||||
document.WB_wombat_domain = orig_host;
|
||||
|
54
pywb/ui/frame_insert.html
Normal file
54
pywb/ui/frame_insert.html
Normal file
@ -0,0 +1,54 @@
|
||||
<html>
|
||||
<head>
|
||||
<!-- Start WB Insert -->
|
||||
<script>
|
||||
wbinfo = {}
|
||||
wbinfo.capture_str = "{{ timestamp | format_ts }}";
|
||||
wbinfo.is_embed = false;
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
wbinfo.capture_url = "{{ url }}";
|
||||
</script>
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
||||
<script>
|
||||
|
||||
window.addEventListener("message", update_url, false);
|
||||
|
||||
// Record the url reported by the inner frame in the wrapper's history entry.
//   url: the original (un-rewritten) url the frame has navigated to.
// Despite the name, this replaces the current history entry (replaceState)
// rather than pushing a new one, so the address bar tracks the frame.
function push_state(url) {
    // The frame is still showing the url this wrapper was rendered for
    // (wbinfo.capture_url): nothing to update.
    if (url == wbinfo.capture_url) {
        return;
    }

    // Declared with `var`: the original assigned to an undeclared `state`,
    // leaking it as a property of the global window object.
    var state = {
        inner_url: wbinfo.prefix + url,           // url loaded inside the iframe
        outer_url: wbinfo.prefix + "fr_/" + url   // url of the frame wrapper page
    };

    window.history.replaceState(state, "", state.outer_url);
}
|
||||
|
||||
// Navigate the inner frame to `url`.
// window.frames[0] is the child frame's Window object, which has no `src`
// property (that lives on the <iframe> element), so assigning to it never
// navigated anything. Go through the frame's location instead; replace()
// avoids adding an extra session-history entry while handling a pop.
function pop_state(url) {
    window.frames[0].location.replace(url);
}

// "message" listener: the framed page posts its current url to the top
// window; only messages coming from our own inner frame are accepted.
function update_url(event) {
    if (event.source == window.frames[0]) {
        push_state(event.data);
    }
}

// Restore the frame when the user moves through history.
window.onpopstate = function(event) {
    var curr_state = event.state;

    if (curr_state) {
        // Load the un-framed url into the iframe. outer_url is the wrapper
        // page itself and would nest a second wrapper inside the frame.
        pop_state(curr_state.inner_url);
    }
}
|
||||
|
||||
</script>
|
||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
|
||||
<!-- End WB Insert -->
</head>
|
||||
<body style="margin: 0px; padding: 0px;">
|
||||
<div class="wb_iframe_div">
|
||||
<iframe src="{{ wbrequest.wb_prefix + embed_url }}" seamless="seamless" frameborder="0" scrolling="yes" class="wb_iframe"></iframe>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
165
pywb/webapp/rewrite_handler.py
Normal file
165
pywb/webapp/rewrite_handler.py
Normal file
@ -0,0 +1,165 @@
|
||||
from pywb.framework.basehandlers import WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
from handlers import StaticHandler
|
||||
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||
|
||||
import datetime
|
||||
#import urllib2
|
||||
import urlparse
|
||||
import httplib
|
||||
import requests
|
||||
|
||||
from io import BytesIO, BufferedReader
|
||||
|
||||
from views import load_template_file
|
||||
|
||||
|
||||
class RewriteHandler(WbUrlHandler):  # pragma: no cover
    """Handler that fetches the requested url live and rewrites the response.

    Requests whose wb_url modifier is ``fr_`` are answered with the frame
    wrapper template. Every other request is proxied to the target url
    with ``requests`` and the response is passed through ``RewriteContent``.
    """

    def __init__(self, head_insert_view=None):
        """Create the content rewriter and load the templates.

        :param head_insert_view: optional view used to build the head
            insert; it must provide ``render_to_string``. When omitted,
            ``ui/head_insert.html`` is loaded.
        """
        #use_lxml_parser()
        self.rewriter = RewriteContent()

        # Honor a caller-supplied view: the argument used to be accepted
        # but was always overwritten by the default template.
        if head_insert_view is None:
            head_insert_view = load_template_file('ui/head_insert.html',
                                                  'Head Insert')

        self.head_insert_view = head_insert_view
        self.frame_insert_view = load_template_file('ui/frame_insert.html',
                                                    'Frame Insert')

    @staticmethod
    def _read_request_body(env):
        """Return the request body bounded by CONTENT_LENGTH, or None.

        WSGI applications should not read past CONTENT_LENGTH, and not
        every server simulates EOF on ``wsgi.input``, so the stream is
        read up to the declared length instead of being handed on as-is.
        """
        try:
            length = int(env.get('CONTENT_LENGTH') or 0)
        except ValueError:
            length = 0

        if length <= 0:
            return None

        return env['wsgi.input'].read(length)

    def proxy_request(self, url, env):
        """Replay the incoming WSGI request against ``url``.

        :param url: target url; a scheme-relative ``//host/...`` url is
            fetched over http.
        :param env: WSGI environ of the incoming request.
        :returns: tuple ``(StatusAndHeaders, raw response stream)``.
        """
        method = env['REQUEST_METHOD'].upper()

        # User-Agent is optional: a client that omits it must not trigger
        # a KeyError here.
        req_headers = {}
        ua = env.get('HTTP_USER_AGENT')
        if ua:
            req_headers['User-Agent'] = ua

        if url.startswith('//'):
            url = 'http:' + url

        if method in ('POST', 'PUT'):
            data = self._read_request_body(env)
        else:
            data = None

        response = self.do_http_request(method,
                                        url,
                                        data,
                                        req_headers)
        code = response.status_code

        # remove transfer-encoding as raw stream
        # is already de-chunked
        try:
            del response.headers['transfer-encoding']
        except KeyError:
            pass

        headers = response.headers.items()
        stream = response.raw

        status_headers = StatusAndHeaders(str(code), headers)

        return (status_headers, stream)

    def do_http_request(self, method, url, data, req_headers):
        """Issue the upstream request with ``requests``.

        Redirects are not followed and the body is streamed, so the
        upstream response can be relayed (and rewritten) as received.
        """
        req = requests.request(method=method,
                               url=url,
                               data=data,
                               headers=req_headers,
                               allow_redirects=False,
                               stream=True)
        return req

    def do_request(self, method, url, data, req_headers):
        """Issue the upstream request with ``httplib``.

        Alternative to :meth:`do_http_request`; nothing in this class
        calls it.

        :returns: the ``httplib`` response object.
        """
        splits = urlparse.urlsplit(url)

        # Drop any "user:pass@" prefix and let httplib parse "host",
        # "host:port" and "[v6]:port" itself. Splitting on the first ':'
        # broke on userinfo and on IPv6 literals.
        authority = splits.netloc.rpartition('@')[2]

        # An empty path is not a valid request target; ask for the root.
        path = splits.path or '/'

        if splits.query:
            path += '?' + splits.query

        if splits.scheme == 'https':
            conn = httplib.HTTPSConnection(authority)
        else:
            conn = httplib.HTTPConnection(authority)

        conn.request(method.upper(), path, data, req_headers)
        return conn.getresponse()

    def __call__(self, wbrequest):
        """Handle one request: frame wrapper for ``fr_``, else live rewrite.

        :param wbrequest: the request; ``wb_url``, ``env`` and
            ``urlrewriter`` are read from it.
        :returns: the rendered frame template response, or a
            ``WbResponse`` wrapping the rewritten upstream content.
        """
        url = wbrequest.wb_url.url

        if wbrequest.wb_url.mod == 'fr_':
            # Frame mode: render the wrapper page whose iframe points at
            # the same url without the modifier.
            embed_url = wbrequest.wb_url.to_str(mod='')
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

            return self.frame_insert_view.render_response(
                embed_url=embed_url,
                wbrequest=wbrequest,
                timestamp=timestamp,
                url=url)

        # A url containing '///' is rebuilt as http:// plus the segment
        # after the first '///'.
        # NOTE(review): presumably repairs a mangled url prefix -- confirm.
        ts_err = url.split('///')
        if len(ts_err) > 1:
            url = 'http://' + ts_err[1]

        try:
            status_headers, stream = self.proxy_request(url, wbrequest.env)
        except Exception:
            # Debug aid only; the error is re-raised. Written as a single
            # parenthesised expression so it is valid on Python 2 and 3 and
            # prints the same text as before (two spaces after "on").
            print('ERR on  ' + url)
            raise

        urlkey = canonicalize(url)

        # Minimal cdx-style record describing this live fetch, handed to
        # the head-insert template.
        cdx = {'urlkey': urlkey,
               'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
               'original': url,
               'statuscode': status_headers.statusline.split(' ')[0],
               'mimetype': status_headers.get_header('Content-Type')
              }

        head_insert_func = self.get_head_insert_func(wbrequest, cdx)

        result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
                                               status_headers,
                                               stream,
                                               head_insert_func=head_insert_func,
                                               urlkey=urlkey)

        # The third element (is_rewritten flag) is not needed here.
        status_headers, gen, _ = result

        return WbResponse(status_headers, gen)

    def get_head_insert_func(self, wbrequest, cdx):
        """Return a ``rule -> str`` callable rendering the head insert.

        Returns None when no head insert view is configured.
        """
        # no head insert specified
        if not self.head_insert_view:
            return None

        def make_head_insert(rule):
            return (self.head_insert_view.
                    render_to_string(wbrequest=wbrequest,
                                     cdx=cdx,
                                     rule=rule))
        return make_head_insert
|
||||
|
||||
def create_rewrite_app():  # pragma: no cover
    """Build the router for the live rewrite app.

    Two routes are registered: ``rewrite`` served by a RewriteHandler and
    ``static/default`` served from ``pywb/static/``.

    NOTE(review): hostpaths names port 8080 while rewrite_live.py starts
    the server on 8090 by default -- confirm this is intended.
    """
    rewrite_route = Route('rewrite', RewriteHandler())
    static_route = Route('static/default', StaticHandler('pywb/static/'))

    return ArchivalRouter([rewrite_route, static_route],
                          hostpaths=['http://localhost:8080'])
|
Loading…
x
Reference in New Issue
Block a user