diff --git a/pywb/apps/rewrite_live.py b/pywb/apps/live_rewrite_server.py similarity index 58% rename from pywb/apps/rewrite_live.py rename to pywb/apps/live_rewrite_server.py index e3b8f45b..9b29e42b 100644 --- a/pywb/apps/rewrite_live.py +++ b/pywb/apps/live_rewrite_server.py @@ -1,16 +1,16 @@ from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server -from pywb.webapp.rewrite_handler import create_rewrite_app +from pywb.webapp.live_rewrite_handler import create_live_rewriter_app #================================================================= # init cdx server app #================================================================= -application = init_app(create_rewrite_app, load_yaml=False) +application = init_app(create_live_rewriter_app, load_yaml=False) def main(): # pragma: no cover - start_wsgi_server(application, 'Rewrite App', default_port=8090) + start_wsgi_server(application, 'Live Rewriter App', default_port=8090) if __name__ == "__main__": main() diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 99cab8d0..36601e98 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -53,7 +53,6 @@ class HTMLRewriterMixin(object): return rewrite_tags - STATE_TAGS = ['script', 'style'] # tags allowed in the of an html document diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index ae0ef70d..61f2641c 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -54,7 +54,7 @@ class RewriteContent: def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey='', - sanitize_only=False): + sanitize_only=False, cdx=None): if sanitize_only: status_headers, stream = self.sanitize_content(headers, stream) @@ -107,7 +107,7 @@ class RewriteContent: head_insert_str = '' if head_insert_func: - head_insert_str = head_insert_func(rule) + head_insert_str = head_insert_func(rule, cdx) rewriter = rewriter_class(urlrewriter, js_rewriter_class=rule.rewriters['js'], diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index ebedd73d..7bc1f8db 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -2,9 +2,7 @@ Fetch a url from live web and apply rewriting rules """ -import urllib2 -import os -import sys +import requests import datetime import mimetypes @@ -18,61 +16,124 @@ from pywb.rewrite.rewrite_content import RewriteContent #================================================================= -def get_status_and_stream(url): - resp = urllib2.urlopen(url) +class LiveRewriter(object): + PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent')] - headers = [] - for name, value in resp.info().dict.iteritems(): - headers.append((name, value)) + def __init__(self, defmod=''): + self.rewriter = RewriteContent(defmod=defmod) - status_headers = StatusAndHeaders('200 OK', headers) - stream = resp + def fetch_local_file(self, uri): + fh = open(uri) - return (status_headers, stream) + content_type, _ = mimetypes.guess_type(uri) + # create fake headers for local file + status_headers = StatusAndHeaders('200 OK', + [('Content-Type', content_type)]) + stream = fh -#================================================================= -def get_local_file(uri): - fh = open(uri) + return (status_headers, stream) - content_type, _ = mimetypes.guess_type(uri) + def translate_headers(self, env, header_list=None): + headers = {} - # create fake headers for local file - status_headers = StatusAndHeaders('200 OK', - [('Content-Type', content_type)]) - stream = fh + if not header_list: + header_list = self.PROXY_HEADER_LIST - return (status_headers, stream) + for env_name, req_name in header_list: + value = env.get(env_name) + if value is not None: + headers[req_name] = value + return headers -#================================================================= -def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): - if is_http(url): - (status_headers, stream) = get_status_and_stream(url) - else: - (status_headers, stream) = get_local_file(url) + def fetch_http(self, url, + env=None, + req_headers={}, + follow_redirects=False): - # explicit urlkey may be passed in (say for testing) - if not urlkey: - urlkey = canonicalize(url) + method = 'GET' + data = None - rewriter = RewriteContent() + if env is not None: + method = env['REQUEST_METHOD'].upper() + input_ = env['wsgi.input'] - result = rewriter.rewrite_content(urlrewriter, - status_headers, - stream, - head_insert_func=head_insert_func, - urlkey=urlkey) + req_headers.update(self.translate_headers(env)) - status_headers, gen, is_rewritten = result + if method in ('POST', 'PUT'): + data = input_ - buff = ''.join(gen) + response = requests.request(method=method, + url=url, + data=data, + headers=req_headers, + allow_redirects=follow_redirects, + stream=True) - return (status_headers, buff) + statusline = str(response.status_code) + ' ' + response.reason + + headers = response.headers.items() + stream = response.raw + + status_headers = StatusAndHeaders(statusline, headers) + + return (status_headers, stream) + + def fetch_request(self, url, urlrewriter, + head_insert_func=None, urlkey=None, + env=None, req_headers={}, follow_redirects=False): + + ts_err = url.split('///') + + if len(ts_err) > 1: + url = 'http://' + ts_err[1] + + if url.startswith('//'): + url = 'http:' + url + + if is_http(url): + (status_headers, stream) = self.fetch_http(url, env, req_headers, + follow_redirects) + else: + (status_headers, stream) = self.fetch_local_file(url) + + # explicit urlkey may be passed in (say for testing) + if not urlkey: + urlkey = canonicalize(url) + + cdx = {'urlkey': urlkey, + 'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()), + 'original': url, + 'statuscode': status_headers.get_statuscode(), + 'mimetype': status_headers.get_header('Content-Type') + } + + result = (self.rewriter. + rewrite_content(urlrewriter, + status_headers, + stream, + head_insert_func=head_insert_func, + urlkey=urlkey, + cdx=cdx)) + + return result + + def get_rewritten(self, *args, **kwargs): + + result = self.fetch_request(*args, **kwargs) + + status_headers, gen, is_rewritten = result + + buff = ''.join(gen) + + return (status_headers, buff) #================================================================= def main(): # pragma: no cover + import sys + if len(sys.argv) < 2: msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]' print msg.format(sys.argv[0]) @@ -94,7 +155,9 @@ def main(): # pragma: no cover urlrewriter = UrlRewriter(wburl_str, prefix) - status_headers, buff = get_rewritten(url, urlrewriter) + liverewriter = LiveRewriter() + + status_headers, buff = liverewriter.get_rewritten(url, urlrewriter) sys.stdout.write(buff) return 0 diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index a7737248..f9eae0b9 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -30,9 +30,11 @@ def use_lxml_parser(): return _is_lxml +#================================================================= def is_lxml(): return _is_lxml + #================================================================= class RewriteRules(BaseRule): def __init__(self, url_prefix, config={}): diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 13a941ea..1e8fa25e 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,4 +1,4 @@ -from pywb.rewrite.rewrite_live import get_rewritten +from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.url_rewriter import UrlRewriter from pywb import get_test_dir @@ -8,7 +8,7 @@ from pywb import get_test_dir urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') -def head_insert_func(rule): +def head_insert_func(rule, cdx): if rule.js_rewrite_location == True: return '' else: @@ -18,8 +18,8 @@ def head_insert_func(rule): def test_local_1(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, - 'com,example,test)/', - head_insert_func) + head_insert_func, + 'com,example,test)/') # wombat insert added assert '' in buff @@ -34,8 +34,8 @@ def test_local_1(): def test_local_2_no_js_location_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, - 'example,example,test)/nolocation_rewrite', - head_insert_func) + head_insert_func, + 'example,example,test)/nolocation_rewrite') # no wombat insert assert '' not in buff @@ -46,28 +46,40 @@ def test_local_2_no_js_location_rewrite(): # still link rewrite assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + def test_example_1(): - status_headers, buff = get_rewritten('http://example.com/', urlrewriter) - - # verify header rewriting - assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers - - -def test_example_2(): - status_headers, buff = get_rewritten('http://example.com/', urlrewriter) + status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'}) # verify header rewriting assert (('X-Archive-Orig-connection', 'close') in status_headers.headers), status_headers assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff +def test_example_2_redirect(): + status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter) + # redirect, no content + assert status_headers.get_statuscode() == '301' + assert len(buff) == 0 + + +def test_example_3_rel(): + status_headers, buff = get_rewritten('//example.com/', urlrewriter) + assert status_headers.get_statuscode() == '200' + + +def test_example_4_rewrite_err(): + # may occur in case of rewrite mismatch, the /// gets stripped off + status_headers, buff = get_rewritten('http://localhost:8080///example.com/', urlrewriter) + assert status_headers.get_statuscode() == '200' def test_example_domain_specific_3(): urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2) + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True) # comment out bootloader assert '/* Bootloader.configurePage' in buff +def get_rewritten(*args, **kwargs): + return LiveRewriter().get_rewritten(*args, **kwargs) diff --git a/pywb/static/wb.css b/pywb/static/wb.css index 90c62927..880f0890 100644 --- a/pywb/static/wb.css +++ b/pywb/static/wb.css @@ -32,7 +32,7 @@ { width: 100%; height: 100%; - padding: 40px 8px 8px 0px; + padding: 40px 4px 4px 0px; border: none; box-sizing: border-box; -moz-box-sizing: border-box; @@ -43,5 +43,5 @@ { width: 100%; height: 100%; - border: 4px solid firebrick; + border: 2px solid tan; } diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index 85805cb2..ae3fc261 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -57,12 +57,20 @@ class StatusAndHeaders(object): return False + def get_statuscode(self): + """ + Return the statuscode part of the status response line + (Assumes no protocol in the statusline) + """ + code = self.statusline.split(' ', 1)[0] + return code + def validate_statusline(self, valid_statusline): """ Check that the statusline is valid, eg. starts with a numeric code. If not, replace with passed in valid_statusline """ - code = self.statusline.split(' ', 1)[0] + code = self.get_statuscode() try: code = int(code) assert(code > 0) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py new file mode 100644 index 00000000..d554c010 --- /dev/null +++ b/pywb/webapp/live_rewrite_handler.py @@ -0,0 +1,65 @@ +from pywb.framework.basehandlers import WbUrlHandler +from pywb.framework.wbrequestresponse import WbResponse +from pywb.framework.archivalrouter import ArchivalRouter, Route + +from pywb.rewrite.rewrite_live import LiveRewriter + +from handlers import StaticHandler + +from pywb.utils.canonicalize import canonicalize +from pywb.utils.timeutils import datetime_to_timestamp +from pywb.utils.statusandheaders import StatusAndHeaders + +from pywb.rewrite.rewriterules import use_lxml_parser + +import datetime + +from views import J2TemplateView, HeadInsertView + + +class RewriteHandler(WbUrlHandler): + def __init__(self, config={}): + #use_lxml_parser() + self.rewriter = LiveRewriter(defmod='mp_') + + head_insert = config.get('head_insert_html', + 'ui/head_insert.html') + + frame_insert = config.get('frame_insert_html', + 'ui/frame_insert.html') + + view = HeadInsertView.create_template(head_insert, 'Head Insert') + self.head_insert_view = view + + view = J2TemplateView.create_template(frame_insert, 'Frame Insert') + self.frame_insert_view = view + + def __call__(self, wbrequest): + + url = wbrequest.wb_url.url + + if not wbrequest.wb_url.mod: + embed_url = wbrequest.wb_url.to_str(mod='mp_') + timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) + + return self.frame_insert_view.render_response(embed_url=embed_url, + wbrequest=wbrequest, + timestamp=timestamp, + url=url) + + head_insert_func = self.head_insert_view.create_insert_func(wbrequest) + + result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, + head_insert_func=head_insert_func, + env=wbrequest.env) + + status_headers, gen, is_rewritten = result + + return WbResponse(status_headers, gen) + + +def create_live_rewriter_app(): + routes = [Route('rewrite', RewriteHandler()), + Route('static/default', StaticHandler('pywb/static/')) + ] + return ArchivalRouter(routes, hostpaths=['http://localhost:8080']) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index c45b5983..7c0f1d7f 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -115,8 +115,8 @@ class ReplayView(object): head_insert_func = None if self.head_insert_view: - head_insert_func = self.head_insert_view.create_insert_func(wbrequest, - cdx) + head_insert_func = (self.head_insert_view. + create_insert_func(wbrequest)) result = (self.content_rewriter. rewrite_content(urlrewriter, @@ -124,7 +124,8 @@ class ReplayView(object): stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - sanitize_only=wbrequest.wb_url.is_identity)) + sanitize_only=wbrequest.wb_url.is_identity, + cdx=cdx)) (status_headers, response_iter, is_rewritten) = result diff --git a/pywb/webapp/rewrite_handler.py b/pywb/webapp/rewrite_handler.py deleted file mode 100644 index ce672516..00000000 --- a/pywb/webapp/rewrite_handler.py +++ /dev/null @@ -1,126 +0,0 @@ -from pywb.framework.basehandlers import WbUrlHandler -from pywb.framework.wbrequestresponse import WbResponse -from pywb.framework.archivalrouter import ArchivalRouter, Route - -from pywb.rewrite.rewrite_content import RewriteContent - -from handlers import StaticHandler - -from pywb.utils.canonicalize import canonicalize -from pywb.utils.timeutils import datetime_to_timestamp -from pywb.utils.statusandheaders import StatusAndHeaders - -from pywb.rewrite.rewriterules import use_lxml_parser - -import datetime -import requests - -from io import BytesIO, BufferedReader - -from views import J2TemplateView, HeadInsertView - - -class RewriteHandler(WbUrlHandler): # pragma: no cover - def __init__(self, head_insert_view=None): - #use_lxml_parser() - self.rewriter = RewriteContent(defmod='mp_') - self.head_insert_view = (HeadInsertView. - create_template('ui/head_insert.html', - 'Head Insert')) - - self.frame_insert_view = (J2TemplateView. - create_template('ui/frame_insert.html', - 'Frame Insert')) - - def proxy_request(self, url, env): - - method = env['REQUEST_METHOD'].upper() - input_ = env['wsgi.input'] - - ua = env['HTTP_USER_AGENT'] - - req_headers = {'User-Agent': ua} - - if url.startswith('//'): - url = 'http:' + url - - if method in ('POST', 'PUT'): - data = input_ - else: - data = None - - response = self.do_http_request(method, - url, - data, - req_headers) - code = response.status_code - - headers = response.headers.items() - stream = response.raw - - status_headers = StatusAndHeaders(str(code), headers) - - return (status_headers, stream) - - def do_http_request(self, method, url, data, req_headers): - req = requests.request(method=method, - url=url, - data=data, - headers=req_headers, - allow_redirects=False, - stream=True) - return req - - def __call__(self, wbrequest): - - url = wbrequest.wb_url.url - - if not wbrequest.wb_url.mod: - embed_url = wbrequest.wb_url.to_str(mod='mp_') - timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) - - return self.frame_insert_view.render_response(embed_url=embed_url, - wbrequest=wbrequest, - timestamp=timestamp, - url=url) - - ts_err = url.split('///') - if len(ts_err) > 1: - url = 'http://' + ts_err[1] - - try: - status_headers, stream = self.proxy_request(url, wbrequest.env) - except Exception: - print 'ERR on ', url - raise - - urlkey = canonicalize(url) - - cdx = {'urlkey': urlkey, - 'timestamp': datetime_to_timestamp(datetime.datetime.utcnow()), - 'original': url, - 'statuscode' : status_headers.statusline.split(' ')[0], - 'mimetype' : status_headers.get_header('Content-Type') - } - - - #head_insert_func = self.get_head_insert_func(wbrequest, cdx) - head_insert_func = self.head_insert_view.create_insert_func(wbrequest, - cdx) - - result = self.rewriter.rewrite_content(wbrequest.urlrewriter, - status_headers, - stream, - head_insert_func=head_insert_func, - urlkey=urlkey) - - status_headers, gen, is_rewritten = result - - return WbResponse(status_headers, gen) - - -def create_rewrite_app(): # pragma: no cover - routes = [Route('rewrite', RewriteHandler()), - Route('static/default', StaticHandler('pywb/static/')) - ] - return ArchivalRouter(routes, hostpaths=['http://localhost:8080']) diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 84e21624..5a560279 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -117,11 +117,11 @@ def add_env_globals(glb): #================================================================= class HeadInsertView(J2TemplateView): - def create_insert_func(self, wbrequest, cdx): + def create_insert_func(self, wbrequest): canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='') - def make_head_insert(rule): + def make_head_insert(rule, cdx): return (self.render_to_string(wbrequest=wbrequest, cdx=cdx, canon_url=canon_url, diff --git a/setup.py b/setup.py index cb5717f1..329537c0 100755 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ setup( install_requires=[ 'rfc3987', 'chardet', + 'requests', 'redis', 'jinja2', 'surt', @@ -84,8 +85,8 @@ setup( [console_scripts] wayback = pywb.apps.wayback:main cdx-server = pywb.apps.cdx_server:main - rewrite-live = pywb.apps.rewrite_live:main cdx-indexer = pywb.warc.archiveindexer:main + live-rewrite-server = pywb.apps.live_rewrite_server:main """, zip_safe=False, classifiers=[ diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py new file mode 100644 index 00000000..2cc9e108 --- /dev/null +++ b/tests/test_live_rewriter.py @@ -0,0 +1,25 @@ +from pywb.webapp.live_rewrite_handler import create_live_rewriter_app +from pywb.framework.wsgi_wrappers import init_app +import webtest + +class TestLiveRewriter: + def setup(self): + self.app = init_app(create_live_rewriter_app, load_yaml=False) + self.testapp = webtest.TestApp(self.app) + + def test_live_rewrite_1(self): + headers = [('User-Agent', 'python')] + resp = self.testapp.get('/rewrite/mp_/http://example.com/', headers=headers) + assert resp.status_int == 200 + + def test_live_rewrite_redirect_2(self): + resp = self.testapp.get('/rewrite/mp_/http://facebook.com/') + assert resp.status_int == 301 + + def test_live_rewrite_frame(self): + resp = self.testapp.get('/rewrite/http://example.com/') + assert resp.status_int == 200 + assert '