mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
live rewriter: integrate handler with rewrite_live.py module,
clean up css, add unit and integration tests; clean up cli server, now known as 'live-rewrite-server', which performs live rewrite using iframe paradigm
This commit is contained in:
parent
11202c462f
commit
bfc2e63793
@ -1,16 +1,16 @@
|
|||||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||||
|
|
||||||
from pywb.webapp.rewrite_handler import create_rewrite_app
|
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# init cdx server app
|
# init cdx server app
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
|
||||||
application = init_app(create_rewrite_app, load_yaml=False)
|
application = init_app(create_live_rewriter_app, load_yaml=False)
|
||||||
|
|
||||||
|
|
||||||
def main(): # pragma: no cover
|
def main(): # pragma: no cover
|
||||||
start_wsgi_server(application, 'Rewrite App', default_port=8090)
|
start_wsgi_server(application, 'Live Rewriter App', default_port=8090)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
@ -53,7 +53,6 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
return rewrite_tags
|
return rewrite_tags
|
||||||
|
|
||||||
|
|
||||||
STATE_TAGS = ['script', 'style']
|
STATE_TAGS = ['script', 'style']
|
||||||
|
|
||||||
# tags allowed in the <head> of an html document
|
# tags allowed in the <head> of an html document
|
||||||
|
@ -54,7 +54,7 @@ class RewriteContent:
|
|||||||
|
|
||||||
def rewrite_content(self, urlrewriter, headers, stream,
|
def rewrite_content(self, urlrewriter, headers, stream,
|
||||||
head_insert_func=None, urlkey='',
|
head_insert_func=None, urlkey='',
|
||||||
sanitize_only=False):
|
sanitize_only=False, cdx=None):
|
||||||
|
|
||||||
if sanitize_only:
|
if sanitize_only:
|
||||||
status_headers, stream = self.sanitize_content(headers, stream)
|
status_headers, stream = self.sanitize_content(headers, stream)
|
||||||
@ -107,7 +107,7 @@ class RewriteContent:
|
|||||||
head_insert_str = ''
|
head_insert_str = ''
|
||||||
|
|
||||||
if head_insert_func:
|
if head_insert_func:
|
||||||
head_insert_str = head_insert_func(rule)
|
head_insert_str = head_insert_func(rule, cdx)
|
||||||
|
|
||||||
rewriter = rewriter_class(urlrewriter,
|
rewriter = rewriter_class(urlrewriter,
|
||||||
js_rewriter_class=rule.rewriters['js'],
|
js_rewriter_class=rule.rewriters['js'],
|
||||||
|
@ -2,9 +2,7 @@
|
|||||||
Fetch a url from live web and apply rewriting rules
|
Fetch a url from live web and apply rewriting rules
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import urllib2
|
import requests
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import datetime
|
import datetime
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
@ -18,61 +16,124 @@ from pywb.rewrite.rewrite_content import RewriteContent
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def get_status_and_stream(url):
|
class LiveRewriter(object):
|
||||||
resp = urllib2.urlopen(url)
|
PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')]
|
||||||
|
|
||||||
headers = []
|
def __init__(self, defmod=''):
|
||||||
for name, value in resp.info().dict.iteritems():
|
self.rewriter = RewriteContent(defmod=defmod)
|
||||||
headers.append((name, value))
|
|
||||||
|
|
||||||
status_headers = StatusAndHeaders('200 OK', headers)
|
def fetch_local_file(self, uri):
|
||||||
stream = resp
|
fh = open(uri)
|
||||||
|
|
||||||
return (status_headers, stream)
|
content_type, _ = mimetypes.guess_type(uri)
|
||||||
|
|
||||||
|
# create fake headers for local file
|
||||||
|
status_headers = StatusAndHeaders('200 OK',
|
||||||
|
[('Content-Type', content_type)])
|
||||||
|
stream = fh
|
||||||
|
|
||||||
#=================================================================
|
return (status_headers, stream)
|
||||||
def get_local_file(uri):
|
|
||||||
fh = open(uri)
|
|
||||||
|
|
||||||
content_type, _ = mimetypes.guess_type(uri)
|
def translate_headers(self, env, header_list=None):
|
||||||
|
headers = {}
|
||||||
|
|
||||||
# create fake headers for local file
|
if not header_list:
|
||||||
status_headers = StatusAndHeaders('200 OK',
|
header_list = self.PROXY_HEADER_LIST
|
||||||
[('Content-Type', content_type)])
|
|
||||||
stream = fh
|
|
||||||
|
|
||||||
return (status_headers, stream)
|
for env_name, req_name in header_list:
|
||||||
|
value = env.get(env_name)
|
||||||
|
if value is not None:
|
||||||
|
headers[req_name] = value
|
||||||
|
|
||||||
|
return headers
|
||||||
|
|
||||||
#=================================================================
|
def fetch_http(self, url,
|
||||||
def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
|
env=None,
|
||||||
if is_http(url):
|
req_headers={},
|
||||||
(status_headers, stream) = get_status_and_stream(url)
|
follow_redirects=False):
|
||||||
else:
|
|
||||||
(status_headers, stream) = get_local_file(url)
|
|
||||||
|
|
||||||
# explicit urlkey may be passed in (say for testing)
|
method = 'GET'
|
||||||
if not urlkey:
|
data = None
|
||||||
urlkey = canonicalize(url)
|
|
||||||
|
|
||||||
rewriter = RewriteContent()
|
if env is not None:
|
||||||
|
method = env['REQUEST_METHOD'].upper()
|
||||||
|
input_ = env['wsgi.input']
|
||||||
|
|
||||||
result = rewriter.rewrite_content(urlrewriter,
|
req_headers.update(self.translate_headers(env))
|
||||||
status_headers,
|
|
||||||
stream,
|
|
||||||
head_insert_func=head_insert_func,
|
|
||||||
urlkey=urlkey)
|
|
||||||
|
|
||||||
status_headers, gen, is_rewritten = result
|
if method in ('POST', 'PUT'):
|
||||||
|
data = input_
|
||||||
|
|
||||||
buff = ''.join(gen)
|
response = requests.request(method=method,
|
||||||
|
url=url,
|
||||||
|
data=data,
|
||||||
|
headers=req_headers,
|
||||||
|
allow_redirects=follow_redirects,
|
||||||
|
stream=True)
|
||||||
|
|
||||||
return (status_headers, buff)
|
statusline = str(response.status_code) + ' ' + response.reason
|
||||||
|
|
||||||
|
headers = response.headers.items()
|
||||||
|
stream = response.raw
|
||||||
|
|
||||||
|
status_headers = StatusAndHeaders(statusline, headers)
|
||||||
|
|
||||||
|
return (status_headers, stream)
|
||||||
|
|
||||||
|
def fetch_request(self, url, urlrewriter,
|
||||||
|
head_insert_func=None, urlkey=None,
|
||||||
|
env=None, req_headers={}, follow_redirects=False):
|
||||||
|
|
||||||
|
ts_err = url.split('///')
|
||||||
|
|
||||||
|
if len(ts_err) > 1:
|
||||||
|
url = 'http://' + ts_err[1]
|
||||||
|
|
||||||
|
if url.startswith('//'):
|
||||||
|
url = 'http:' + url
|
||||||
|
|
||||||
|
if is_http(url):
|
||||||
|
(status_headers, stream) = self.fetch_http(url, env, req_headers,
|
||||||
|
follow_redirects)
|
||||||
|
else:
|
||||||
|
(status_headers, stream) = self.fetch_local_file(url)
|
||||||
|
|
||||||
|
# explicit urlkey may be passed in (say for testing)
|
||||||
|
if not urlkey:
|
||||||
|
urlkey = canonicalize(url)
|
||||||
|
|
||||||
|
cdx = {'urlkey': urlkey,
|
||||||
|
'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
|
||||||
|
'original': url,
|
||||||
|
'statuscode': status_headers.get_statuscode(),
|
||||||
|
'mimetype': status_headers.get_header('Content-Type')
|
||||||
|
}
|
||||||
|
|
||||||
|
result = (self.rewriter.
|
||||||
|
rewrite_content(urlrewriter,
|
||||||
|
status_headers,
|
||||||
|
stream,
|
||||||
|
head_insert_func=head_insert_func,
|
||||||
|
urlkey=urlkey,
|
||||||
|
cdx=cdx))
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def get_rewritten(self, *args, **kwargs):
|
||||||
|
|
||||||
|
result = self.fetch_request(*args, **kwargs)
|
||||||
|
|
||||||
|
status_headers, gen, is_rewritten = result
|
||||||
|
|
||||||
|
buff = ''.join(gen)
|
||||||
|
|
||||||
|
return (status_headers, buff)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def main(): # pragma: no cover
|
def main(): # pragma: no cover
|
||||||
|
import sys
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
|
msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
|
||||||
print msg.format(sys.argv[0])
|
print msg.format(sys.argv[0])
|
||||||
@ -94,7 +155,9 @@ def main(): # pragma: no cover
|
|||||||
|
|
||||||
urlrewriter = UrlRewriter(wburl_str, prefix)
|
urlrewriter = UrlRewriter(wburl_str, prefix)
|
||||||
|
|
||||||
status_headers, buff = get_rewritten(url, urlrewriter)
|
liverewriter = LiveRewriter()
|
||||||
|
|
||||||
|
status_headers, buff = liverewriter.get_rewritten(url, urlrewriter)
|
||||||
|
|
||||||
sys.stdout.write(buff)
|
sys.stdout.write(buff)
|
||||||
return 0
|
return 0
|
||||||
|
@ -30,9 +30,11 @@ def use_lxml_parser():
|
|||||||
return _is_lxml
|
return _is_lxml
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
def is_lxml():
|
def is_lxml():
|
||||||
return _is_lxml
|
return _is_lxml
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriteRules(BaseRule):
|
class RewriteRules(BaseRule):
|
||||||
def __init__(self, url_prefix, config={}):
|
def __init__(self, url_prefix, config={}):
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.rewrite.rewrite_live import get_rewritten
|
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
@ -8,7 +8,7 @@ from pywb import get_test_dir
|
|||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
|
|
||||||
def head_insert_func(rule):
|
def head_insert_func(rule, cdx):
|
||||||
if rule.js_rewrite_location == True:
|
if rule.js_rewrite_location == True:
|
||||||
return '<script src="/static/default/wombat.js"> </script>'
|
return '<script src="/static/default/wombat.js"> </script>'
|
||||||
else:
|
else:
|
||||||
@ -18,8 +18,8 @@ def head_insert_func(rule):
|
|||||||
def test_local_1():
|
def test_local_1():
|
||||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||||
urlrewriter,
|
urlrewriter,
|
||||||
'com,example,test)/',
|
head_insert_func,
|
||||||
head_insert_func)
|
'com,example,test)/')
|
||||||
|
|
||||||
# wombat insert added
|
# wombat insert added
|
||||||
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
|
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
|
||||||
@ -34,8 +34,8 @@ def test_local_1():
|
|||||||
def test_local_2_no_js_location_rewrite():
|
def test_local_2_no_js_location_rewrite():
|
||||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||||
urlrewriter,
|
urlrewriter,
|
||||||
'example,example,test)/nolocation_rewrite',
|
head_insert_func,
|
||||||
head_insert_func)
|
'example,example,test)/nolocation_rewrite')
|
||||||
|
|
||||||
# no wombat insert
|
# no wombat insert
|
||||||
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
|
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
|
||||||
@ -46,28 +46,40 @@ def test_local_2_no_js_location_rewrite():
|
|||||||
# still link rewrite
|
# still link rewrite
|
||||||
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
||||||
|
|
||||||
|
|
||||||
def test_example_1():
|
def test_example_1():
|
||||||
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
|
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})
|
||||||
|
|
||||||
# verify header rewriting
|
|
||||||
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
|
|
||||||
|
|
||||||
|
|
||||||
def test_example_2():
|
|
||||||
status_headers, buff = get_rewritten('http://example.com/', urlrewriter)
|
|
||||||
|
|
||||||
# verify header rewriting
|
# verify header rewriting
|
||||||
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
|
assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers
|
||||||
|
|
||||||
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
|
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
|
||||||
|
|
||||||
|
def test_example_2_redirect():
|
||||||
|
status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter)
|
||||||
|
|
||||||
|
# redirect, no content
|
||||||
|
assert status_headers.get_statuscode() == '301'
|
||||||
|
assert len(buff) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_example_3_rel():
|
||||||
|
status_headers, buff = get_rewritten('//example.com/', urlrewriter)
|
||||||
|
assert status_headers.get_statuscode() == '200'
|
||||||
|
|
||||||
|
|
||||||
|
def test_example_4_rewrite_err():
|
||||||
|
# may occur in case of rewrite mismatch, the /// gets stripped off
|
||||||
|
status_headers, buff = get_rewritten('http://localhost:8080///example.com/', urlrewriter)
|
||||||
|
assert status_headers.get_statuscode() == '200'
|
||||||
|
|
||||||
def test_example_domain_specific_3():
|
def test_example_domain_specific_3():
|
||||||
urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2)
|
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True)
|
||||||
|
|
||||||
# comment out bootloader
|
# comment out bootloader
|
||||||
assert '/* Bootloader.configurePage' in buff
|
assert '/* Bootloader.configurePage' in buff
|
||||||
|
|
||||||
|
|
||||||
|
def get_rewritten(*args, **kwargs):
|
||||||
|
return LiveRewriter().get_rewritten(*args, **kwargs)
|
||||||
|
@ -32,7 +32,7 @@
|
|||||||
{
|
{
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 100%;
|
height: 100%;
|
||||||
padding: 40px 8px 8px 0px;
|
padding: 40px 4px 4px 0px;
|
||||||
border: none;
|
border: none;
|
||||||
box-sizing: border-box;
|
box-sizing: border-box;
|
||||||
-moz-box-sizing: border-box;
|
-moz-box-sizing: border-box;
|
||||||
@ -43,5 +43,5 @@
|
|||||||
{
|
{
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 100%;
|
height: 100%;
|
||||||
border: 4px solid firebrick;
|
border: 2px solid tan;
|
||||||
}
|
}
|
||||||
|
@ -57,12 +57,20 @@ class StatusAndHeaders(object):
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_statuscode(self):
|
||||||
|
"""
|
||||||
|
Return the statuscode part of the status response line
|
||||||
|
(Assumes no protocol in the statusline)
|
||||||
|
"""
|
||||||
|
code = self.statusline.split(' ', 1)[0]
|
||||||
|
return code
|
||||||
|
|
||||||
def validate_statusline(self, valid_statusline):
|
def validate_statusline(self, valid_statusline):
|
||||||
"""
|
"""
|
||||||
Check that the statusline is valid, eg. starts with a numeric
|
Check that the statusline is valid, eg. starts with a numeric
|
||||||
code. If not, replace with passed in valid_statusline
|
code. If not, replace with passed in valid_statusline
|
||||||
"""
|
"""
|
||||||
code = self.statusline.split(' ', 1)[0]
|
code = self.get_statuscode()
|
||||||
try:
|
try:
|
||||||
code = int(code)
|
code = int(code)
|
||||||
assert(code > 0)
|
assert(code > 0)
|
||||||
|
65
pywb/webapp/live_rewrite_handler.py
Normal file
65
pywb/webapp/live_rewrite_handler.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
from pywb.framework.basehandlers import WbUrlHandler
|
||||||
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||||
|
|
||||||
|
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||||
|
|
||||||
|
from handlers import StaticHandler
|
||||||
|
|
||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
|
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
from views import J2TemplateView, HeadInsertView
|
||||||
|
|
||||||
|
|
||||||
|
class RewriteHandler(WbUrlHandler):
|
||||||
|
def __init__(self, config={}):
|
||||||
|
#use_lxml_parser()
|
||||||
|
self.rewriter = LiveRewriter(defmod='mp_')
|
||||||
|
|
||||||
|
head_insert = config.get('head_insert_html',
|
||||||
|
'ui/head_insert.html')
|
||||||
|
|
||||||
|
frame_insert = config.get('frame_insert_html',
|
||||||
|
'ui/frame_insert.html')
|
||||||
|
|
||||||
|
view = HeadInsertView.create_template(head_insert, 'Head Insert')
|
||||||
|
self.head_insert_view = view
|
||||||
|
|
||||||
|
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
|
||||||
|
self.frame_insert_view = view
|
||||||
|
|
||||||
|
def __call__(self, wbrequest):
|
||||||
|
|
||||||
|
url = wbrequest.wb_url.url
|
||||||
|
|
||||||
|
if not wbrequest.wb_url.mod:
|
||||||
|
embed_url = wbrequest.wb_url.to_str(mod='mp_')
|
||||||
|
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
||||||
|
|
||||||
|
return self.frame_insert_view.render_response(embed_url=embed_url,
|
||||||
|
wbrequest=wbrequest,
|
||||||
|
timestamp=timestamp,
|
||||||
|
url=url)
|
||||||
|
|
||||||
|
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||||
|
|
||||||
|
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
|
||||||
|
head_insert_func=head_insert_func,
|
||||||
|
env=wbrequest.env)
|
||||||
|
|
||||||
|
status_headers, gen, is_rewritten = result
|
||||||
|
|
||||||
|
return WbResponse(status_headers, gen)
|
||||||
|
|
||||||
|
|
||||||
|
def create_live_rewriter_app():
|
||||||
|
routes = [Route('rewrite', RewriteHandler()),
|
||||||
|
Route('static/default', StaticHandler('pywb/static/'))
|
||||||
|
]
|
||||||
|
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|
@ -115,8 +115,8 @@ class ReplayView(object):
|
|||||||
|
|
||||||
head_insert_func = None
|
head_insert_func = None
|
||||||
if self.head_insert_view:
|
if self.head_insert_view:
|
||||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
|
head_insert_func = (self.head_insert_view.
|
||||||
cdx)
|
create_insert_func(wbrequest))
|
||||||
|
|
||||||
result = (self.content_rewriter.
|
result = (self.content_rewriter.
|
||||||
rewrite_content(urlrewriter,
|
rewrite_content(urlrewriter,
|
||||||
@ -124,7 +124,8 @@ class ReplayView(object):
|
|||||||
stream=stream,
|
stream=stream,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
urlkey=cdx['urlkey'],
|
urlkey=cdx['urlkey'],
|
||||||
sanitize_only=wbrequest.wb_url.is_identity))
|
sanitize_only=wbrequest.wb_url.is_identity,
|
||||||
|
cdx=cdx))
|
||||||
|
|
||||||
(status_headers, response_iter, is_rewritten) = result
|
(status_headers, response_iter, is_rewritten) = result
|
||||||
|
|
||||||
|
@ -1,126 +0,0 @@
|
|||||||
from pywb.framework.basehandlers import WbUrlHandler
|
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
|
||||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
|
||||||
|
|
||||||
from handlers import StaticHandler
|
|
||||||
|
|
||||||
from pywb.utils.canonicalize import canonicalize
|
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
|
|
||||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from io import BytesIO, BufferedReader
|
|
||||||
|
|
||||||
from views import J2TemplateView, HeadInsertView
|
|
||||||
|
|
||||||
|
|
||||||
class RewriteHandler(WbUrlHandler): # pragma: no cover
|
|
||||||
def __init__(self, head_insert_view=None):
|
|
||||||
#use_lxml_parser()
|
|
||||||
self.rewriter = RewriteContent(defmod='mp_')
|
|
||||||
self.head_insert_view = (HeadInsertView.
|
|
||||||
create_template('ui/head_insert.html',
|
|
||||||
'Head Insert'))
|
|
||||||
|
|
||||||
self.frame_insert_view = (J2TemplateView.
|
|
||||||
create_template('ui/frame_insert.html',
|
|
||||||
'Frame Insert'))
|
|
||||||
|
|
||||||
def proxy_request(self, url, env):
|
|
||||||
|
|
||||||
method = env['REQUEST_METHOD'].upper()
|
|
||||||
input_ = env['wsgi.input']
|
|
||||||
|
|
||||||
ua = env['HTTP_USER_AGENT']
|
|
||||||
|
|
||||||
req_headers = {'User-Agent': ua}
|
|
||||||
|
|
||||||
if url.startswith('//'):
|
|
||||||
url = 'http:' + url
|
|
||||||
|
|
||||||
if method in ('POST', 'PUT'):
|
|
||||||
data = input_
|
|
||||||
else:
|
|
||||||
data = None
|
|
||||||
|
|
||||||
response = self.do_http_request(method,
|
|
||||||
url,
|
|
||||||
data,
|
|
||||||
req_headers)
|
|
||||||
code = response.status_code
|
|
||||||
|
|
||||||
headers = response.headers.items()
|
|
||||||
stream = response.raw
|
|
||||||
|
|
||||||
status_headers = StatusAndHeaders(str(code), headers)
|
|
||||||
|
|
||||||
return (status_headers, stream)
|
|
||||||
|
|
||||||
def do_http_request(self, method, url, data, req_headers):
|
|
||||||
req = requests.request(method=method,
|
|
||||||
url=url,
|
|
||||||
data=data,
|
|
||||||
headers=req_headers,
|
|
||||||
allow_redirects=False,
|
|
||||||
stream=True)
|
|
||||||
return req
|
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
|
|
||||||
url = wbrequest.wb_url.url
|
|
||||||
|
|
||||||
if not wbrequest.wb_url.mod:
|
|
||||||
embed_url = wbrequest.wb_url.to_str(mod='mp_')
|
|
||||||
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
|
||||||
|
|
||||||
return self.frame_insert_view.render_response(embed_url=embed_url,
|
|
||||||
wbrequest=wbrequest,
|
|
||||||
timestamp=timestamp,
|
|
||||||
url=url)
|
|
||||||
|
|
||||||
ts_err = url.split('///')
|
|
||||||
if len(ts_err) > 1:
|
|
||||||
url = 'http://' + ts_err[1]
|
|
||||||
|
|
||||||
try:
|
|
||||||
status_headers, stream = self.proxy_request(url, wbrequest.env)
|
|
||||||
except Exception:
|
|
||||||
print 'ERR on ', url
|
|
||||||
raise
|
|
||||||
|
|
||||||
urlkey = canonicalize(url)
|
|
||||||
|
|
||||||
cdx = {'urlkey': urlkey,
|
|
||||||
'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()),
|
|
||||||
'original': url,
|
|
||||||
'statuscode' : status_headers.statusline.split(' ')[0],
|
|
||||||
'mimetype' : status_headers.get_header('Content-Type')
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#head_insert_func = self.get_head_insert_func(wbrequest, cdx)
|
|
||||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest,
|
|
||||||
cdx)
|
|
||||||
|
|
||||||
result = self.rewriter.rewrite_content(wbrequest.urlrewriter,
|
|
||||||
status_headers,
|
|
||||||
stream,
|
|
||||||
head_insert_func=head_insert_func,
|
|
||||||
urlkey=urlkey)
|
|
||||||
|
|
||||||
status_headers, gen, is_rewritten = result
|
|
||||||
|
|
||||||
return WbResponse(status_headers, gen)
|
|
||||||
|
|
||||||
|
|
||||||
def create_rewrite_app(): # pragma: no cover
|
|
||||||
routes = [Route('rewrite', RewriteHandler()),
|
|
||||||
Route('static/default', StaticHandler('pywb/static/'))
|
|
||||||
]
|
|
||||||
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|
|
@ -117,11 +117,11 @@ def add_env_globals(glb):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HeadInsertView(J2TemplateView):
|
class HeadInsertView(J2TemplateView):
|
||||||
def create_insert_func(self, wbrequest, cdx):
|
def create_insert_func(self, wbrequest):
|
||||||
|
|
||||||
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
|
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
|
||||||
|
|
||||||
def make_head_insert(rule):
|
def make_head_insert(rule, cdx):
|
||||||
return (self.render_to_string(wbrequest=wbrequest,
|
return (self.render_to_string(wbrequest=wbrequest,
|
||||||
cdx=cdx,
|
cdx=cdx,
|
||||||
canon_url=canon_url,
|
canon_url=canon_url,
|
||||||
|
3
setup.py
3
setup.py
@ -66,6 +66,7 @@ setup(
|
|||||||
install_requires=[
|
install_requires=[
|
||||||
'rfc3987',
|
'rfc3987',
|
||||||
'chardet',
|
'chardet',
|
||||||
|
'requests',
|
||||||
'redis',
|
'redis',
|
||||||
'jinja2',
|
'jinja2',
|
||||||
'surt',
|
'surt',
|
||||||
@ -84,8 +85,8 @@ setup(
|
|||||||
[console_scripts]
|
[console_scripts]
|
||||||
wayback = pywb.apps.wayback:main
|
wayback = pywb.apps.wayback:main
|
||||||
cdx-server = pywb.apps.cdx_server:main
|
cdx-server = pywb.apps.cdx_server:main
|
||||||
rewrite-live = pywb.apps.rewrite_live:main
|
|
||||||
cdx-indexer = pywb.warc.archiveindexer:main
|
cdx-indexer = pywb.warc.archiveindexer:main
|
||||||
|
live-rewrite-server = pywb.apps.live_rewrite_server:main
|
||||||
""",
|
""",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
25
tests/test_live_rewriter.py
Normal file
25
tests/test_live_rewriter.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
from pywb.webapp.live_rewrite_handler import create_live_rewriter_app
|
||||||
|
from pywb.framework.wsgi_wrappers import init_app
|
||||||
|
import webtest
|
||||||
|
|
||||||
|
class TestLiveRewriter:
|
||||||
|
def setup(self):
|
||||||
|
self.app = init_app(create_live_rewriter_app, load_yaml=False)
|
||||||
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
|
def test_live_rewrite_1(self):
|
||||||
|
headers = [('User-Agent', 'python')]
|
||||||
|
resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers)
|
||||||
|
assert resp.status_int == 200
|
||||||
|
|
||||||
|
def test_live_rewrite_redirect_2(self):
|
||||||
|
resp = self.testapp.get('/rewrite/mp_/http://facebook.com/')
|
||||||
|
assert resp.status_int == 301
|
||||||
|
|
||||||
|
def test_live_rewrite_frame(self):
|
||||||
|
resp = self.testapp.get('/rewrite/http://example.com/')
|
||||||
|
assert resp.status_int == 200
|
||||||
|
assert '<iframe ' in resp.body
|
||||||
|
assert 'src="/rewrite/mp_/http://example.com/"' in resp.body
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user