diff --git a/README.rst b/README.rst index 9c4b380d..6aa256ac 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.5.1 +PyWb 0.5.2 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index e066d4d1..5bbb65b8 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -40,13 +40,13 @@ # WbResponse Tests # ================= >>> WbResponse.text_response('Test') -{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])} +{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain'), ('Content-Length', '4')])} >>> WbResponse.text_stream(['Test', 'Another'], '404') {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} >>> WbResponse.redir_response('http://example.com/otherfile') -{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} +{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])} """ diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 0f1a9f32..da456474 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -125,7 +125,7 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE') + mime = self.env.get('CONTENT_TYPE').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] @@ -152,23 +152,31 @@ class WbResponse(object): pass @staticmethod - def text_stream(stream, status='200 OK', content_type='text/plain'): - status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + def text_stream(stream, status='200 OK', content_type='text/plain', + headers=None): + def_headers = [('Content-Type', content_type)] + if headers: + def_headers += headers + + status_headers = StatusAndHeaders(status, def_headers) return WbResponse(status_headers, value=stream) @staticmethod def text_response(text, status='200 OK', content_type='text/plain'): status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + [('Content-Type', content_type), + ('Content-Length', str(len(text)))]) return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status='302 Redirect'): - return WbResponse(StatusAndHeaders(status, - [('Location', location)])) + def redir_response(location, status='302 Redirect', headers=None): + redir_headers = [('Location', location), ('Content-Length', '0')] + if headers: + redir_headers += headers + + return WbResponse(StatusAndHeaders(status, redir_headers)) def __call__(self, env, start_response): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index ec93593a..93ec396b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -1,6 +1,8 @@ #import chardet import pkgutil import yaml +import re + from chardet.universaldetector import UniversalDetector from io import BytesIO @@ -52,11 +54,12 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, urlrewriter, headers, stream, + def rewrite_content(self, wb_url, urlrewriter, headers, stream, head_insert_func=None, urlkey='', - sanitize_only=False, cdx=None, mod=None): + cdx=None): - if sanitize_only: + if (wb_url.is_identity or + (not head_insert_func and wb_url.is_banner_only)): status_headers, stream = self.sanitize_content(headers, stream) return (status_headers, self.stream_to_gen(stream), False) @@ -78,6 +81,8 @@ class RewriteContent: # see known js/css modifier specified, the context should run # default text_type + mod = wb_url.mod + if mod == 'js_': text_type = 'js' elif mod == 'cs_': @@ -118,6 +123,10 @@ class RewriteContent: if head_insert_func: head_insert_str = head_insert_func(rule, cdx) + if wb_url.is_banner_only: + gen = self._head_insert_only_gen(head_insert_str, stream) + return (status_headers, gen, False) + rewriter = rewriter_class(urlrewriter, js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], @@ -125,7 +134,10 @@ class RewriteContent: defmod=self.defmod) else: - # apply one of (js, css, xml) rewriters + if wb_url.is_banner_only: + return (status_headers, self.stream_to_gen(stream), False) + + # apply one of (js, css, xml) rewriters rewriter = rewriter_class(urlrewriter) # Create rewriting generator @@ -134,6 +146,32 @@ class RewriteContent: return (status_headers, gen, True) + HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I) + + def _head_insert_only_gen(self, insert_str, stream): + max_len = 1024 + buff = '' + while max_len > 0: + curr = stream.read(max_len) + if not curr: + break + + max_len -= len(buff) + buff += curr + + matcher = self.HEAD_REGEX.search(buff) + + if matcher: + yield buff[:matcher.end()] + insert_str + yield buff[matcher.end():] + else: + yield insert_str + yield buff + + for buff in self.stream_to_gen(stream): + yield buff + + # Create rewrite stream, may even be chunked by front-end def _rewriting_stream_gen(self, rewriter, encoding, stream, first_buff=None): diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index b81b0144..97024600 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -14,8 +14,9 @@ from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize -from pywb.rewrite.url_rewriter import UrlRewriter -from pywb.rewrite.rewrite_content import RewriteContent +from url_rewriter import UrlRewriter +from wburl import WbUrl +from rewrite_content import RewriteContent #================================================================= @@ -114,15 +115,20 @@ class LiveRewriter(object): return (status_headers, stream) - def fetch_request(self, url, urlrewriter, + def fetch_request(self, wb_url, urlrewriter, head_insert_func=None, urlkey=None, env=None, req_headers={}, timestamp=None, follow_redirects=False, - proxies=None, - mod=None): + proxies=None): + + if isinstance(wb_url, str): + url = wb_url + wb_url = WbUrl(url) + else: + url = wb_url.url ts_err = url.split('///') @@ -155,13 +161,13 @@ class LiveRewriter(object): } result = (self.rewriter. - rewrite_content(urlrewriter, + rewrite_content(wb_url, + urlrewriter, status_headers, stream, head_insert_func=head_insert_func, urlkey=urlkey, - cdx=cdx, - mod=mod)) + cdx=cdx)) return result @@ -174,41 +180,3 @@ class LiveRewriter(object): buff = ''.join(gen) return (status_headers, buff) - - -#================================================================= -def main(): # pragma: no cover - import sys - - if len(sys.argv) < 2: - msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]' - print msg.format(sys.argv[0]) - return 1 - else: - url = sys.argv[1] - - if len(sys.argv) >= 3: - wburl_str = sys.argv[2] - if wburl_str.startswith('/'): - wburl_str = wburl_str[1:] - - prefix, wburl_str = wburl_str.split('/', 1) - prefix = '/' + prefix + '/' - else: - wburl_str = (datetime_to_timestamp(datetime.datetime.now()) + - '/http://example.com/path/sample.html') - prefix = '/pywb_rewrite/' - - urlrewriter = UrlRewriter(wburl_str, prefix) - - liverewriter = LiveRewriter() - - status_headers, buff = liverewriter.get_rewritten(url, urlrewriter) - - sys.stdout.write(buff) - return 0 - - -#================================================================= -if __name__ == "__main__": - exit(main()) diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 3cd9ad72..f826108f 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -196,8 +196,11 @@ class WbUrl(BaseWbUrl): @property def is_embed(self): return (self.mod and - self.mod != 'id_' and - self.mod != 'mp_') + self.mod not in ('id_', 'mp_', 'bn_')) + + @property + def is_banner_only(self): + return (self.mod == 'bn_') @property def is_identity(self): diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index be810823..b1ff4a26 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,5 +1,5 @@ -{% if rule.js_rewrite_location %} +{% if rule.js_rewrite_location and include_wombat %} diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 6228de3e..ce30793d 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -115,6 +115,14 @@ class StaticHandler(BaseHandler): try: data = self.block_loader.load(full_path) + try: + data.seek(0, 2) + size = data.tell() + data.seek(0) + headers = [('Content-Length', str(size))] + except IOError: + headers = None + if 'wsgi.file_wrapper' in wbrequest.env: reader = wbrequest.env['wsgi.file_wrapper'](data) else: @@ -122,7 +130,9 @@ class StaticHandler(BaseHandler): content_type, _ = mimetypes.guess_type(full_path) - return WbResponse.text_stream(data, content_type=content_type) + return WbResponse.text_stream(data, + content_type=content_type, + headers=headers) except IOError: raise NotFoundException('Static File Not Found: ' + diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index c4e0f4f3..2542aee2 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -99,8 +99,8 @@ class RewriteLiveView(BaseContentView): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - url = wbrequest.wb_url.url - result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, + wb_url = wbrequest.wb_url + result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, head_insert_func=head_insert_func, env=wbrequest.env) @@ -211,14 +211,13 @@ class ReplayView(BaseContentView): create_insert_func(wbrequest)) result = (self.content_rewriter. - rewrite_content(urlrewriter, + rewrite_content(wbrequest.wb_url, + urlrewriter, headers=status_headers, stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], - sanitize_only=wbrequest.wb_url.is_identity, - cdx=cdx, - mod=wbrequest.wb_url.mod)) + cdx=cdx)) (status_headers, response_iter, is_rewritten) = result diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index c49be8c9..0fc5589d 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -121,16 +121,18 @@ def add_env_globals(glb): #================================================================= class HeadInsertView(J2TemplateView): - def create_insert_func(self, wbrequest, include_ts=True): + def create_insert_func(self, wbrequest, + include_ts=True): canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='') - include_ts = include_ts + include_wombat = not wbrequest.wb_url.is_banner_only def make_head_insert(rule, cdx): return (self.render_to_string(wbrequest=wbrequest, cdx=cdx, canon_url=canon_url, include_ts=include_ts, + include_wombat=include_wombat, rule=rule)) return make_head_insert diff --git a/setup.py b/setup.py index 3e89abed..a6e9c885 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.5.1', + version='0.5.2', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', diff --git a/tests/test_integration.py b/tests/test_integration.py index 94ce45cf..456d50f8 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -98,6 +98,7 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'wb.js' in resp.body + assert 'WB_wombat_init' in resp.body assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body def test_replay_non_frame_content(self): @@ -141,6 +142,19 @@ class TestWb: assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239') + def test_replay_banner_only(self): + resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved') + + # wb.js header insertion + assert 'wb.js' in resp.body + + # no wombat present + assert 'WB_wombat_init' not in resp.body + + # url not rewritten + #assert '"http://www.iana.org/domains/example"' in resp.body + assert '"/_css/2013.1/screen.css"' in resp.body + def test_replay_identity_1(self): resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')