1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

live rewriter: integrate handler with rewrite_live.py module,

clean up css, add unit and integration tests
clean up cli server, now known as 'live-rewrite-server', which performs live rewriting using
the iframe paradigm
This commit is contained in:
Ilya Kreymer 2014-04-09 15:46:03 -07:00
parent 11202c462f
commit bfc2e63793
14 changed files with 245 additions and 195 deletions

View File

@ -1,16 +1,16 @@
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.webapp.rewrite_handler import create_rewrite_app
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
#=================================================================
# init cdx server app
#=================================================================
application = init_app(create_rewrite_app, load_yaml=False)
application = init_app(create_live_rewriter_app, load_yaml=False)
def main(): # pragma: no cover
start_wsgi_server(application, 'Rewrite App', default_port=8090)
start_wsgi_server(application, 'Live Rewriter App', default_port=8090)
if __name__ == "__main__":
main()

View File

@ -53,7 +53,6 @@ class HTMLRewriterMixin(object):
return rewrite_tags
STATE_TAGS = ['script', 'style']
# tags allowed in the <head> of an html document

View File

@ -54,7 +54,7 @@ class RewriteContent:
def rewrite_content(self, urlrewriter, headers, stream,
head_insert_func=None, urlkey='',
sanitize_only=False):
sanitize_only=False, cdx=None):
if sanitize_only:
status_headers, stream = self.sanitize_content(headers, stream)
@ -107,7 +107,7 @@ class RewriteContent:
head_insert_str = ''
if head_insert_func:
head_insert_str = head_insert_func(rule)
head_insert_str = head_insert_func(rule, cdx)
rewriter = rewriter_class(urlrewriter,
js_rewriter_class=rule.rewriters['js'],

View File

@ -2,9 +2,7 @@
Fetch a url from live web and apply rewriting rules
"""
import urllib2
import os
import sys
import requests
import datetime
import mimetypes
@ -18,61 +16,124 @@ from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
def get_status_and_stream(url):
resp = urllib2.urlopen(url)
class LiveRewriter(object):
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')]
headers = []
for name, value in resp.info().dict.iteritems():
headers.append((name, value))
def __init__(self, defmod=''):
self.rewriter = RewriteContent(defmod=defmod)
status_headers = StatusAndHeaders('200 OK', headers)
stream = resp
def fetch_local_file(self, uri):
fh = open(uri)
return (status_headers, stream)
content_type, _ = mimetypes.guess_type(uri)
# create fake headers for local file
status_headers = StatusAndHeaders('200 OK',
[('Content-Type', content_type)])
stream = fh
#=================================================================
def get_local_file(uri):
fh = open(uri)
return (status_headers, stream)
content_type, _ = mimetypes.guess_type(uri)
def translate_headers(self, env, header_list=None):
headers = {}
# create fake headers for local file
status_headers = StatusAndHeaders('200 OK',
[('Content-Type', content_type)])
stream = fh
if not header_list:
header_list = self.PROXY_HEADER_LIST
return (status_headers, stream)
for env_name, req_name in header_list:
value = env.get(env_name)
if value is not None:
headers[req_name] = value
return headers
#=================================================================
def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
if is_http(url):
(status_headers, stream) = get_status_and_stream(url)
else:
(status_headers, stream) = get_local_file(url)
def fetch_http(self, url,
env=None,
req_headers={},
follow_redirects=False):
# explicit urlkey may be passed in (say for testing)
if not urlkey:
urlkey = canonicalize(url)
method = 'GET'
data = None
rewriter = RewriteContent()
if env is not None:
method = env['REQUEST_METHOD'].upper()
input_ = env['wsgi.input']
result = rewriter.rewrite_content(urlrewriter,
status_headers,
stream,
head_insert_func=head_insert_func,
urlkey=urlkey)
req_headers.update(self.translate_headers(env))
status_headers, gen, is_rewritten = result
if method in ('POST', 'PUT'):
data = input_
buff = ''.join(gen)
response = requests.request(method=method,
url=url,
data=data,
headers=req_headers,
allow_redirects=follow_redirects,
stream=True)
return (status_headers, buff)
statusline = str(response.status_code) + ' ' + response.reason
headers = response.headers.items()
stream = response.raw
status_headers = StatusAndHeaders(statusline, headers)
return (status_headers, stream)
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None, urlkey=None,
                  env=None, req_headers=None, follow_redirects=False):
    """
    Fetch url and pass the response through the content rewriter.

    urls accepted by is_http() are fetched with fetch_http(), forwarding
    env, req_headers and follow_redirects; anything else is opened as a
    local file with fetch_local_file().

    If urlkey is not given, it is computed with canonicalize(url).

    A cdx dict describing this fetch (urlkey, current utc timestamp,
    original url, status code and Content-Type) is built and passed to
    rewrite_content() together with head_insert_func.

    Returns the result of rewrite_content() unchanged
    """
    # Work on a private copy: fetch_http() merges env-derived headers
    # into the dict it receives, so a shared default dict (or the
    # caller's own dict) would carry one request's headers into the next
    req_headers = dict(req_headers) if req_headers else {}

    # a url containing '///' is reduced to the part following it
    # (the tests describe this as a rewrite mismatch case)
    ts_err = url.split('///')

    if len(ts_err) > 1:
        url = 'http://' + ts_err[1]

    # scheme-relative url: default to http
    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    cdx = {'urlkey': urlkey,
           'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
           'original': url,
           'statuscode': status_headers.get_statuscode(),
           'mimetype': status_headers.get_header('Content-Type')
          }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    return result
def get_rewritten(self, *args, **kwargs):
    """
    Call fetch_request() with the given arguments and return
    (status_headers, buff), where buff is the rewritten content
    joined into a single string
    """
    status_headers, chunks, _ = self.fetch_request(*args, **kwargs)
    return (status_headers, ''.join(chunks))
#=================================================================
def main(): # pragma: no cover
import sys
if len(sys.argv) < 2:
msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
print msg.format(sys.argv[0])
@ -94,7 +155,9 @@ def main(): # pragma: no cover
urlrewriter = UrlRewriter(wburl_str, prefix)
status_headers, buff = get_rewritten(url, urlrewriter)
liverewriter = LiveRewriter()
status_headers, buff = liverewriter.get_rewritten(url, urlrewriter)
sys.stdout.write(buff)
return 0

View File

@ -30,9 +30,11 @@ def use_lxml_parser():
return _is_lxml
#=================================================================
def is_lxml():
    """
    Return the current value of the module-level _is_lxml flag
    """
    return _is_lxml
#=================================================================
class RewriteRules(BaseRule):
def __init__(self, url_prefix, config={}):

View File

@ -1,4 +1,4 @@
from pywb.rewrite.rewrite_live import get_rewritten
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb import get_test_dir
@ -8,7 +8,7 @@ from pywb import get_test_dir
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule):
def head_insert_func(rule, cdx):
if rule.js_rewrite_location == True:
return '<script src="/static/default/wombat.js"> </script>'
else:
@ -18,8 +18,8 @@ def head_insert_func(rule):
def test_local_1():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
'com,example,test)/',
head_insert_func)
head_insert_func,
'com,example,test)/')
# wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
@ -34,8 +34,8 @@ def test_local_1():
def test_local_2_no_js_location_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
'example,example,test)/nolocation_rewrite',
head_insert_func)
head_insert_func,
'example,example,test)/nolocation_rewrite')
# no wombat insert
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
@ -46,28 +46,40 @@ def test_local_2_no_js_location_rewrite():
# still link rewrite
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
def test_example_2():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})
# verify header rewriting
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
def test_example_2_redirect():
    """
    Redirects are not followed by default: the 301 is passed through
    with an empty body
    """
    result = get_rewritten('http://facebook.com/', urlrewriter)
    status_headers, buff = result

    # redirect, no content
    assert status_headers.get_statuscode() == '301'
    assert len(buff) == 0
def test_example_3_rel():
    """
    A scheme-relative url ('//host/') is fetched successfully
    """
    result = get_rewritten('//example.com/', urlrewriter)
    status_headers, buff = result

    assert status_headers.get_statuscode() == '200'
def test_example_4_rewrite_err():
    """
    A url with an embedded '///' is still fetched successfully
    """
    # may occur in case of rewrite mismatch, the /// gets stripped off
    mismatched_url = 'http://localhost:8080///example.com/'
    status_headers, buff = get_rewritten(mismatched_url, urlrewriter)

    assert status_headers.get_statuscode() == '200'
def test_example_domain_specific_3():
urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2)
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True)
# comment out bootloader
assert '/* Bootloader.configurePage' in buff
def get_rewritten(*args, **kwargs):
    """
    Test helper: forward all arguments to get_rewritten() on a
    newly created LiveRewriter
    """
    liverewriter = LiveRewriter()
    return liverewriter.get_rewritten(*args, **kwargs)

View File

@ -32,7 +32,7 @@
{
width: 100%;
height: 100%;
padding: 40px 8px 8px 0px;
padding: 40px 4px 4px 0px;
border: none;
box-sizing: border-box;
-moz-box-sizing: border-box;
@ -43,5 +43,5 @@
{
width: 100%;
height: 100%;
border: 4px solid firebrick;
border: 2px solid tan;
}

View File

@ -57,12 +57,20 @@ class StatusAndHeaders(object):
return False
def get_statuscode(self):
    """
    Return the status code portion of the statusline, i.e. everything
    before the first space (the whole statusline if it has no space)
    (Assumes no protocol in the statusline)
    """
    return self.statusline.partition(' ')[0]
def validate_statusline(self, valid_statusline):
"""
Check that the statusline is valid, eg. starts with a numeric
code. If not, replace with passed in valid_statusline
"""
code = self.statusline.split(' ', 1)[0]
code = self.get_statuscode()
try:
code = int(code)
assert(code > 0)

View File

@ -0,0 +1,65 @@
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter
from handlers import StaticHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.rewriterules import use_lxml_parser
import datetime
from views import J2TemplateView, HeadInsertView
class RewriteHandler(WbUrlHandler):
    """
    Handler for the live rewrite route.

    A request whose wb_url has no modifier is answered with the frame
    insert template, which embeds the same url with the 'mp_' modifier.
    Any other request is fetched live through LiveRewriter and returned
    as a rewritten response.
    """
    def __init__(self, config=None):
        """
        config may override the template paths via the keys
        'head_insert_html' and 'frame_insert_html'
        """
        # None instead of a mutable {} default; treated as "no overrides"
        config = config or {}

        #use_lxml_parser()
        self.rewriter = LiveRewriter(defmod='mp_')

        head_insert = config.get('head_insert_html',
                                 'ui/head_insert.html')

        frame_insert = config.get('frame_insert_html',
                                  'ui/frame_insert.html')

        view = HeadInsertView.create_template(head_insert, 'Head Insert')
        self.head_insert_view = view

        view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
        self.frame_insert_view = view

    def __call__(self, wbrequest):
        url = wbrequest.wb_url.url

        # no modifier: serve the outer frame page instead of the content
        if not wbrequest.wb_url.mod:
            embed_url = wbrequest.wb_url.to_str(mod='mp_')
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

            return self.frame_insert_view.render_response(embed_url=embed_url,
                                                          wbrequest=wbrequest,
                                                          timestamp=timestamp,
                                                          url=url)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)

        # Pass a fresh dict explicitly: the fetch path merges env-derived
        # headers (e.g. User-Agent) into req_headers, so every request
        # must own its dict rather than share one across requests
        result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
                                             head_insert_func=head_insert_func,
                                             env=wbrequest.env,
                                             req_headers={})

        status_headers, gen, is_rewritten = result

        return WbResponse(status_headers, gen)
def create_live_rewriter_app():
    """
    Build the router for the live rewriter app: the rewrite handler
    under 'rewrite' and the static file handler under 'static/default'
    """
    rewrite_route = Route('rewrite', RewriteHandler())
    static_route = Route('static/default', StaticHandler('pywb/static/'))

    return ArchivalRouter([rewrite_route, static_route],
                          hostpaths=['http://localhost:8080'])

View File

@ -115,8 +115,8 @@ class ReplayView(object):
head_insert_func = None
if self.head_insert_view:
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
cdx)
head_insert_func = (self.head_insert_view.
create_insert_func(wbrequest))
result = (self.content_rewriter.
rewrite_content(urlrewriter,
@ -124,7 +124,8 @@ class ReplayView(object):
stream=stream,
head_insert_func=head_insert_func,
urlkey=cdx['urlkey'],
sanitize_only=wbrequest.wb_url.is_identity))
sanitize_only=wbrequest.wb_url.is_identity,
cdx=cdx))
(status_headers, response_iter, is_rewritten) = result

View File

@ -1,126 +0,0 @@
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_content import RewriteContent
from handlers import StaticHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.rewriterules import use_lxml_parser
import datetime
import requests
from io import BytesIO, BufferedReader
from views import J2TemplateView, HeadInsertView
class RewriteHandler(WbUrlHandler):  # pragma: no cover
    """
    Handler that proxies a request to the live web and rewrites the
    response.

    A request whose wb_url has no modifier is answered with the frame
    insert template, which embeds the same url with the 'mp_' modifier.
    """
    def __init__(self, head_insert_view=None):
        # NOTE(review): head_insert_view is accepted but never used here;
        # both templates below are always created from fixed paths
        #use_lxml_parser()
        self.rewriter = RewriteContent(defmod='mp_')
        self.head_insert_view = (HeadInsertView.
                                 create_template('ui/head_insert.html',
                                                 'Head Insert'))

        self.frame_insert_view = (J2TemplateView.
                                  create_template('ui/frame_insert.html',
                                                  'Frame Insert'))

    def proxy_request(self, url, env):
        """
        Repeat the request described by the wsgi env against url and
        return (status_headers, stream) for the response
        """
        method = env['REQUEST_METHOD'].upper()
        input_ = env['wsgi.input']

        # forward User-Agent only when the client sent one; indexing
        # env['HTTP_USER_AGENT'] directly raised KeyError otherwise
        req_headers = {}
        ua = env.get('HTTP_USER_AGENT')
        if ua is not None:
            req_headers['User-Agent'] = ua

        # scheme-relative url: default to http
        if url.startswith('//'):
            url = 'http:' + url

        # only these methods forward the request body
        if method in ('POST', 'PUT'):
            data = input_
        else:
            data = None

        response = self.do_http_request(method,
                                        url,
                                        data,
                                        req_headers)

        # include the reason phrase so the statusline is a full
        # "<code> <reason>" status string rather than a bare code
        statusline = str(response.status_code) + ' ' + response.reason
        headers = response.headers.items()
        stream = response.raw

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)

    def do_http_request(self, method, url, data, req_headers):
        """
        Issue the request without following redirects and with a
        streamed (not pre-read) response body
        """
        req = requests.request(method=method,
                               url=url,
                               data=data,
                               headers=req_headers,
                               allow_redirects=False,
                               stream=True)
        return req

    def __call__(self, wbrequest):
        url = wbrequest.wb_url.url

        # no modifier: serve the outer frame page instead of the content
        if not wbrequest.wb_url.mod:
            embed_url = wbrequest.wb_url.to_str(mod='mp_')
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

            return self.frame_insert_view.render_response(embed_url=embed_url,
                                                          wbrequest=wbrequest,
                                                          timestamp=timestamp,
                                                          url=url)

        # a url containing '///' is reduced to the part following it
        ts_err = url.split('///')
        if len(ts_err) > 1:
            url = 'http://' + ts_err[1]

        try:
            status_headers, stream = self.proxy_request(url, wbrequest.env)
        except Exception:
            # single-argument form: same output whether print is the
            # statement or the function
            print('%s %s' % ('ERR on ', url))
            raise

        urlkey = canonicalize(url)

        cdx = {'urlkey': urlkey,
               'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
               'original': url,
               'statuscode': status_headers.statusline.split(' ')[0],
               'mimetype': status_headers.get_header('Content-Type')
              }

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
                                                                    cdx)

        result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
                                               status_headers,
                                               stream,
                                               head_insert_func=head_insert_func,
                                               urlkey=urlkey)

        status_headers, gen, is_rewritten = result

        return WbResponse(status_headers, gen)
def create_rewrite_app():  # pragma: no cover
    """
    Build the router for the rewrite app: the rewrite handler under
    'rewrite' and the static file handler under 'static/default'
    """
    rewrite_route = Route('rewrite', RewriteHandler())
    static_route = Route('static/default', StaticHandler('pywb/static/'))

    return ArchivalRouter([rewrite_route, static_route],
                          hostpaths=['http://localhost:8080'])

View File

@ -117,11 +117,11 @@ def add_env_globals(glb):
#=================================================================
class HeadInsertView(J2TemplateView):
def create_insert_func(self, wbrequest, cdx):
def create_insert_func(self, wbrequest):
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
def make_head_insert(rule):
def make_head_insert(rule, cdx):
return (self.render_to_string(wbrequest=wbrequest,
cdx=cdx,
canon_url=canon_url,

View File

@ -66,6 +66,7 @@ setup(
install_requires=[
'rfc3987',
'chardet',
'requests',
'redis',
'jinja2',
'surt',
@ -84,8 +85,8 @@ setup(
[console_scripts]
wayback = pywb.apps.wayback:main
cdx-server = pywb.apps.cdx_server:main
rewrite-live = pywb.apps.rewrite_live:main
cdx-indexer = pywb.warc.archiveindexer:main
live-rewrite-server = pywb.apps.live_rewrite_server:main
""",
zip_safe=False,
classifiers=[

View File

@ -0,0 +1,25 @@
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
from pywb.framework.wsgi_wrappers import init_app
import webtest
class TestLiveRewriter:
    """
    Integration tests driving the live rewriter app through webtest
    """
    def setup(self):
        app = init_app(create_live_rewriter_app, load_yaml=False)
        self.app = app
        self.testapp = webtest.TestApp(app)

    def test_live_rewrite_1(self):
        # request with an explicit User-Agent header
        resp = self.testapp.get('/rewrite/mp_/http://example.com/',
                                headers=[('User-Agent', 'python')])
        assert resp.status_int == 200

    def test_live_rewrite_redirect_2(self):
        # the redirect status is expected to be passed through
        redirect_url = '/rewrite/mp_/http://facebook.com/'
        resp = self.testapp.get(redirect_url)
        assert resp.status_int == 301

    def test_live_rewrite_frame(self):
        # no modifier in the url: the frame page is returned
        resp = self.testapp.get('/rewrite/http://example.com/')
        assert resp.status_int == 200

        body = resp.body
        assert '<iframe ' in body
        assert 'src="/rewrite/mp_/http://example.com/"' in body