1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add rewriter_handler, frame wrapper support!

This commit is contained in:
Ilya Kreymer 2014-04-08 22:43:32 -07:00
parent 8897a0a7c9
commit 1fb6f5eff7
6 changed files with 398 additions and 36 deletions

16
pywb/apps/rewrite_live.py Normal file
View File

@ -0,0 +1,16 @@
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.webapp.rewrite_handler import create_rewrite_app
#=================================================================
# init live rewrite app
#=================================================================
application = init_app(create_rewrite_app, load_yaml=False)
def main():  # pragma: no cover
    """Serve the module-level ``application`` via start_wsgi_server.

    The server is labelled 'Rewrite App' and uses 8090 as its
    default port.
    """
    app_title = 'Rewrite App'
    start_wsgi_server(application, app_title, default_port=8090)
if __name__ == "__main__":
main()

View File

@ -5,11 +5,12 @@
top: 0px !important;
left: 0px !important;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif !important;
position: absolute !important;
padding: 4px !important;
position: fixed !important;
/* padding: 4px !important; */
height: 40px !important;
width: 100% !important;
font-size: 24px !important;
border: 1px solid !important;
/* border: 1px solid !important; */
background-color: lightYellow !important;
color: black !important;
text-align: center !important;
@ -17,3 +18,35 @@
line-height: normal !important;
}
/* Wrapper block for the replay iframe; frame_insert.html puts this class
   on the div that contains the iframe. */
.wb_iframe_div
{
width: 100%;
height: 100%;
/* NOTE(review): the 40px top padding presumably leaves room for the
   40px-high banner styled above -- confirm */
padding: 40px 8px 8px 0px;
border: none;
/* border-box keeps the padding inside the 100% width/height */
box-sizing: border-box;
-moz-box-sizing: border-box;
-webkit-box-sizing: border-box;
}
/* The replay iframe itself (class set on the iframe in frame_insert.html):
   fills its wrapper and is outlined with a firebrick border. */
.wb_iframe
{
width: 100%;
height: 100%;
border: 4px solid firebrick;
}
/* Borderless variant: the firebrick background shows through the padding,
   so the padding acts as the frame outline.
   NOTE(review): no user of this class is visible in this commit -- confirm
   where it is applied. */
.wb_iframe_all
{
width: 100%;
height: 100%;
border: none;
background-color: firebrick;
padding: 44px 4px 4px 4px;
/* border-box keeps the padding inside the 100% width/height */
box-sizing: border-box;
-moz-box-sizing: border-box;
-webkit-box-sizing: border-box;
}

View File

@ -26,6 +26,10 @@ function init_banner() {
return;
}
if (window.top != window.self) {
return;
}
if (!banner) {
banner = document.createElement("wb_div");
banner.setAttribute("id", BANNER_ID);
@ -41,12 +45,54 @@ function init_banner() {
}
}
var readyStateCheckInterval = setInterval(function() {
// Attach 'func' as a listener for event 'name' on 'object'.
// Uses addEventListener when present, otherwise falls back to attachEvent
// (with the "on" prefix). Returns true if a listener was attached,
// false if the object supports neither API.
function add_event(name, func, object) {
    var attached = true;

    if (object.addEventListener) {
        object.addEventListener(name, func);
    } else if (object.attachEvent) {
        object.attachEvent("on" + name, func);
    } else {
        attached = false;
    }

    return attached;
}
// Detach 'func' from event 'name' on 'object'.
// Uses removeEventListener when present, otherwise falls back to
// detachEvent (with the "on" prefix). Returns true if one of the two
// APIs was called, false if the object supports neither.
function remove_event(name, func, object) {
    var detached = true;

    if (object.removeEventListener) {
        object.removeEventListener(name, func);
    } else if (object.detachEvent) {
        object.detachEvent("on" + name, func);
    } else {
        detached = false;
    }

    return detached;
}
var notified_top = false;
var detect_on_init = function() {
if (!notified_top && window && window.top && (window.self != window.top) && window.WB_wombat_location) {
if (!wbinfo.is_embed) {
window.top.postMessage(window.WB_wombat_location.href, "*");
}
notified_top = true;
}
if (document.readyState === "interactive" ||
document.readyState === "complete") {
init_banner();
clearInterval(readyStateCheckInterval);
remove_event("readystatechange", detect_on_init, document);
}
}, 10);
}
add_event("readystatechange", detect_on_init, document);
/*
if ((window.self == window.top) && !wbinfo.is_embed && window.location.href.indexOf("/rewrite/fr_/") == -1) {
new_loc = window.location.href.replace("/rewrite/", "/rewrite/fr_/");
window.location.replace(new_loc);
}
*/

View File

@ -60,18 +60,28 @@ WB_wombat_init = (function() {
}
}
// Return 'suffix' if 'str' ends with it, otherwise undefined.
// The search starts at the only offset where the suffix could still fit,
// so a hit there means the string really ends with it.
function ends_with(str, suffix) {
    var start = str.length - suffix.length;
    var found = (str.indexOf(suffix, start) !== -1);
    return found ? suffix : undefined;
}
//============================================
/* function rewrite_url_debug(url) {
var rewrite_url = rewrite_url_debug;
function rewrite_url_debug(url) {
rewritten = rewrite_url_(url);
if (url != rewritten) {
console.log('REWRITE: ' + url + ' -> ' + rewritten);
} else {
console.log('NOT REWRITTEN ' + url);
//console.log('NOT REWRITTEN ' + url);
}
return rewritten;
}
*/
function rewrite_url(url) {
function rewrite_url_(url) {
var http_prefix = "http://";
var https_prefix = "https://";
var rel_prefix = "//";
@ -144,13 +154,22 @@ WB_wombat_init = (function() {
if (!href) {
return "";
}
href = href.toString();
var index = href.indexOf("/http", 1);
// extract original url from wburl
if (index > 0) {
return href.substr(index + 1);
} else {
return href;
href = href.substr(index + 1);
}
// remove trailing slash
if (ends_with(href, "/")) {
href = href.substring(0, href.length - 1);
}
return href;
}
//============================================
@ -196,26 +215,39 @@ WB_wombat_init = (function() {
}
//============================================
function update_location(req_href, orig_href, location) {
if (req_href && (extract_orig(orig_href) != extract_orig(req_href))) {
var final_href = rewrite_url(req_href);
location.href = final_href;
// Navigate 'actual_location' to the rewritten form of 'req_href' when the
// requested url really differs from the current one.
//
//   req_href        - the url the page tried to navigate to
//   orig_href       - the url the page was on before the change
//   actual_location - the real Location object to update
//
// Nothing happens if no url was requested, if it is identical to
// orig_href, or if both resolve to the same original (un-rewritten) url.
function update_location(req_href, orig_href, actual_location) {
    if (!req_href || req_href == orig_href) {
        return;
    }

    // declared with 'var' so they stay local instead of leaking globals
    var ext_orig = extract_orig(orig_href);
    var ext_req = extract_orig(req_href);

    if (!ext_orig || ext_orig == ext_req) {
        return;
    }

    var final_href = rewrite_url(req_href);

    // 'console' is not defined in every browser (e.g. older IE without the
    // developer tools open); an unguarded call would throw and the
    // navigation below would never happen
    if (window.console && window.console.log) {
        window.console.log(actual_location.href + ' -> ' + final_href);
    }

    actual_location.href = final_href;
}
//============================================
function check_location_change(loc, is_top) {
var locType = (typeof loc);
var location = (is_top ? window.top.location : window.location);
var actual_location = (is_top ? window.top.location : window.location);
//console.log(loc.href);
// String has been assigned to location, so assign it
if (locType == "string") {
update_location(loc, location.href, location)
update_location(loc, actual_location.href, actual_location)
} else if (locType == "object") {
update_location(loc.href, loc._orig_href, location);
update_location(loc.href, loc._orig_href, actual_location);
}
}
@ -306,7 +338,6 @@ WB_wombat_init = (function() {
window.Worker = undefined;
}
function rewrite_attr(elem, name) {
if (!elem || !elem.getAttribute) {
return;
@ -324,25 +355,41 @@ WB_wombat_init = (function() {
orig_value = value;
value = rewrite_url(value);
elem.setAttribute(name, value);
}
function init_dom_override() {
if (!Element ||
!Element.prototype) {
if (!Node || !Node.prototype) {
return;
}
function replace_dom_func(funcname) {
var orig = Node.prototype[funcname];
var orig = Element.prototype[funcname];
Element.prototype[funcname] = function() {
Node.prototype[funcname] = function() {
rewrite_attr(arguments[0], "src");
rewrite_attr(arguments[0], "href");
return orig.apply(this, arguments);
child = arguments[0];
var desc;
if (child instanceof DocumentFragment) {
desc = child.querySelectorAll("*[href],*[src]");
} else if (child.getElementsByTagName) {
desc = child.getElementsByTagName("*");
}
if (desc) {
for (var i = 0; i < desc.length; i++) {
rewrite_attr(desc[i], "src");
rewrite_attr(desc[i], "href");
}
}
result = orig.apply(this, arguments);
return result;
}
}
@ -363,13 +410,14 @@ WB_wombat_init = (function() {
window.WB_wombat_location = copy_location_obj(window.self.location);
document.WB_wombat_location = window.WB_wombat_location;
if (window.self.location != window.top.location) {
window.top.WB_wombat_location = copy_location_obj(window.top.location);
}
//if (window.self.location != window.top.location) {
// window.top.WB_wombat_location = copy_location_obj(window.top.location);
//}
window.top.WB_wombat_location = window.WB_wombat_location;
if (window.opener) {
window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
}
//if (window.opener) {
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
//}
// Domain
document.WB_wombat_domain = orig_host;

54
pywb/ui/frame_insert.html Normal file
View File

@ -0,0 +1,54 @@
<html>
<head>
<!-- Start WB Insert -->
<script>
// Global replay settings for the frame page; the template engine fills in
// the {{ ... }} placeholders before the page is served.
// NOTE(review): values are interpolated directly into JS string literals --
// confirm the template environment escapes quotes in 'url'.
wbinfo = {}
// capture time, formatted by the format_ts template filter
wbinfo.capture_str = "{{ timestamp | format_ts }}";
// this is the outer frame page, not embedded content
wbinfo.is_embed = false;
// prefix prepended to urls when building replay links (see push_state)
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
// url this frame page was rendered for
wbinfo.capture_url = "{{ url }}";
</script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<script>
window.addEventListener("message", update_url, false);
// Update the address bar of the frame page to reflect the url now shown
// inside the replay iframe.
//
//   url - the original (un-rewritten) url reported by the framed page
//
// Does nothing when 'url' is the url this page was rendered for.
function push_state(url) {
    if (url == wbinfo.capture_url) {
        return;
    }

    // 'var' keeps the state object local instead of creating a global
    var state = {};
    state.inner_url = wbinfo.prefix + url;
    state.outer_url = wbinfo.prefix + "fr_/" + url;

    // replaceState rewrites the current history entry in place, so the
    // displayed url changes without adding a new entry
    window.history.replaceState(state, "", state.outer_url);
}
// Load 'url' in the replay iframe.
function pop_state(url) {
    // window.frames[0] is the frame's window object, which has no 'src'
    // property (that belongs to the <iframe> element), so navigate through
    // its location instead. replace() is used so restoring a state does not
    // itself add another history entry.
    var frame = window.frames[0];
    if (frame) {
        frame.location.replace(url);
    }
}

// "message" listener: the framed page posts its current url to the top
// window; only messages coming from the replay iframe are accepted.
function update_url(event) {
    if (event.source == window.frames[0]) {
        push_state(event.data);
    }
}

// When a history entry carrying a saved state is restored, show that
// entry's page in the iframe again.
window.onpopstate = function(event) {
    var curr_state = event.state;
    if (curr_state) {
        // inner_url is the un-framed replay url; outer_url is the fr_
        // wrapper page and would nest a second frame inside the iframe
        pop_state(curr_state.inner_url);
    }
}
</script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
<!-- End WB Insert -->
</head>
<body style="margin: 0px; padding: 0px;">
<div class="wb_iframe_div">
<!-- iframe is not a void element: it needs an explicit end tag, otherwise the rest of the page is parsed as its fallback content -->
<iframe src="{{ wbrequest.wb_prefix + embed_url }}" seamless="seamless" frameborder="0" scrolling="yes" class="wb_iframe"></iframe>
</div>
</body>
</html>

View File

@ -0,0 +1,165 @@
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_content import RewriteContent
from handlers import StaticHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.rewriterules import use_lxml_parser
import datetime
#import urllib2
import urlparse
import httplib
import requests
from io import BytesIO, BufferedReader
from views import load_template_file
class RewriteHandler(WbUrlHandler):  # pragma: no cover
    """Fetch a url from the live web and return it rewritten.

    Requests whose wb_url modifier is ``fr_`` are answered with the
    frame wrapper page (``ui/frame_insert.html``). All other requests
    are fetched with ``requests`` and passed through ``RewriteContent``.
    """

    def __init__(self, head_insert_view=None):
        """Set up the content rewriter and the two templates.

        :param head_insert_view: view used to render the head insert;
            when None, ``ui/head_insert.html`` is loaded.
        """
        #use_lxml_parser()
        self.rewriter = RewriteContent()

        # honour a caller-supplied view instead of silently replacing it
        if head_insert_view is None:
            head_insert_view = load_template_file('ui/head_insert.html',
                                                  'Head Insert')

        self.head_insert_view = head_insert_view

        self.frame_insert_view = load_template_file('ui/frame_insert.html',
                                                    'Frame Insert')

    def proxy_request(self, url, env):
        """Fetch ``url`` live, mirroring the incoming WSGI request.

        :param url: url to fetch; a scheme-relative ``//host/path``
            is fetched over http.
        :param env: WSGI environ of the incoming request.
        :return: ``(StatusAndHeaders, raw response stream)`` tuple.
        """
        method = env['REQUEST_METHOD'].upper()

        # User-Agent is optional, so the WSGI key may be missing;
        # only forward it when the client actually sent one
        req_headers = {}
        ua = env.get('HTTP_USER_AGENT')
        if ua:
            req_headers['User-Agent'] = ua

        if url.startswith('//'):
            url = 'http:' + url

        # only these methods forward the request body
        if method in ('POST', 'PUT'):
            data = env['wsgi.input']
        else:
            data = None

        response = self.do_http_request(method, url, data, req_headers)

        # remove transfer-encoding as raw stream
        # is already de-chunked
        try:
            del response.headers['transfer-encoding']
        except KeyError:
            pass

        status_headers = StatusAndHeaders(str(response.status_code),
                                          response.headers.items())

        return (status_headers, response.raw)

    def do_http_request(self, method, url, data, req_headers):
        """Perform the live request with ``requests``.

        Redirects are not followed and the body is streamed rather than
        read up front. Returns the ``requests`` response object.
        """
        return requests.request(method=method,
                                url=url,
                                data=data,
                                headers=req_headers,
                                allow_redirects=False,
                                stream=True)

    def do_request(self, method, url, data, req_headers):
        """Perform the live request with ``httplib``.

        Not called by ``proxy_request``, which uses ``do_http_request``.
        Returns the ``httplib`` response object.
        """
        splits = urlparse.urlsplit(url)

        # hostname/port cope with userinfo and bracketed IPv6 literals,
        # which splitting the netloc on ':' does not
        host = splits.hostname
        port = splits.port

        # the request target must not be empty ('http://example.com')
        path = splits.path or '/'
        if splits.query:
            path += '?' + splits.query

        if splits.scheme == 'https':
            conn_cls = httplib.HTTPSConnection
        else:
            conn_cls = httplib.HTTPConnection

        conn = conn_cls(host, port)
        conn.request(method.upper(), path, data, req_headers)
        return conn.getresponse()

    def __call__(self, wbrequest):
        """Handle one request.

        :return: the rendered frame page for the ``fr_`` modifier,
            otherwise a ``WbResponse`` with the rewritten live content.
        """
        url = wbrequest.wb_url.url

        if wbrequest.wb_url.mod == 'fr_':
            # url of the same resource without the frame modifier,
            # used as the iframe source
            embed_url = wbrequest.wb_url.to_str(mod='')
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

            return self.frame_insert_view.render_response(
                embed_url=embed_url,
                wbrequest=wbrequest,
                timestamp=timestamp,
                url=url)

        # a url containing '///' is replaced by 'http://' plus the segment
        # following the first '///'
        # NOTE(review): presumably repairs a mangled scheme separator --
        # confirm which inputs produce this
        ts_err = url.split('///')
        if len(ts_err) > 1:
            url = 'http://' + ts_err[1]

        try:
            status_headers, stream = self.proxy_request(url, wbrequest.env)
        except Exception:
            # report through logging (with traceback) rather than printing
            # to stdout, then let the error propagate unchanged
            import logging
            logging.exception('ERR on %s', url)
            raise

        urlkey = canonicalize(url)

        # minimal cdx-style record for the live fetch; the timestamp is
        # the current UTC time
        cdx = {'urlkey': urlkey,
               'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
               'original': url,
               'statuscode': status_headers.statusline.split(' ')[0],
               'mimetype': status_headers.get_header('Content-Type')
               }

        head_insert_func = self.get_head_insert_func(wbrequest, cdx)

        result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
                                               status_headers,
                                               stream,
                                               head_insert_func=head_insert_func,
                                               urlkey=urlkey)

        # third element (is_rewritten flag) is not needed here
        status_headers, gen, _ = result

        return WbResponse(status_headers, gen)

    def get_head_insert_func(self, wbrequest, cdx):
        """Return a ``func(rule)`` rendering the head insert, or None.

        None is returned when no head insert view is configured.
        """
        # no head insert specified
        if not self.head_insert_view:
            return None

        def make_head_insert(rule):
            return (self.head_insert_view.
                    render_to_string(wbrequest=wbrequest,
                                     cdx=cdx,
                                     rule=rule))

        return make_head_insert
def create_rewrite_app():  # pragma: no cover
    """Build the router for the live rewrite application.

    Two routes are registered: ``rewrite`` (handled by RewriteHandler)
    and ``static/default`` (static files from ``pywb/static/``).
    """
    # NOTE(review): hostpaths names port 8080 while the launcher script
    # in this commit defaults to 8090 -- confirm which is intended
    rewrite_route = Route('rewrite', RewriteHandler())
    static_route = Route('static/default', StaticHandler('pywb/static/'))

    return ArchivalRouter([rewrite_route, static_route],
                          hostpaths=['http://localhost:8080'])